# Import necessary libraries
import re
import copy
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
# Classification models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier
# Evaluation metrics
from sklearn.metrics import roc_auc_score
from sklearn.metrics import RocCurveDisplay
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, ConfusionMatrixDisplay
# NLTK resources: stopwords for filtering, wordnet for lemmatization.
# NOTE(review): nltk.word_tokenize (used in preprocessing below) also
# needs the 'punkt' resource, which is not downloaded here — confirm
# it is already installed in the runtime environment.
nltk.download('stopwords')
nltk.download('wordnet')
# Suppress sklearn convergence warnings (e.g. from LogisticRegression
# hitting its iteration limit during training)
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
simplefilter("ignore", category=ConvergenceWarning)
[nltk_data] Downloading package stopwords to [nltk_data] C:\Users\user\AppData\Roaming\nltk_data... [nltk_data] Package stopwords is already up-to-date! [nltk_data] Downloading package wordnet to [nltk_data] C:\Users\user\AppData\Roaming\nltk_data... [nltk_data] Package wordnet is already up-to-date!
The dataset used in this project for fake news detection is open-sourced and available on Kaggle: https://www.kaggle.com/datasets/saurabhshahane/fake-news-classification
# Load the WELFake dataset (columns: Unnamed: 0, title, text, label)
news_data = pd.read_csv("dataset/WELFake_Dataset.csv")
# Preview the first five rows
news_data.head()
| Unnamed: 0 | title | text | label | |
|---|---|---|---|---|
| 0 | 0 | LAW ENFORCEMENT ON HIGH ALERT Following Threat... | No comment is expected from Barack Obama Membe... | 1 |
| 1 | 1 | NaN | Did they post their votes for Hillary already? | 1 |
| 2 | 2 | UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO... | Now, most of the demonstrators gathered last ... | 1 |
| 3 | 3 | Bobby Jindal, raised Hindu, uses story of Chri... | A dozen politically active pastors came here f... | 0 |
| 4 | 4 | SATAN 2: Russia unvelis an image of its terrif... | The RS-28 Sarmat missile, dubbed Satan 2, will... | 1 |
# Preview the last five rows
news_data.tail()
| Unnamed: 0 | title | text | label | |
|---|---|---|---|---|
| 72129 | 72129 | Russians steal research on Trump in hack of U.... | WASHINGTON (Reuters) - Hackers believed to be ... | 0 |
| 72130 | 72130 | WATCH: Giuliani Demands That Democrats Apolog... | You know, because in fantasyland Republicans n... | 1 |
| 72131 | 72131 | Migrants Refuse To Leave Train At Refugee Camp... | Migrants Refuse To Leave Train At Refugee Camp... | 0 |
| 72132 | 72132 | Trump tussle gives unpopular Mexican leader mu... | MEXICO CITY (Reuters) - Donald Trump’s combati... | 0 |
| 72133 | 72133 | Goldman Sachs Endorses Hillary Clinton For Pre... | Goldman Sachs Endorses Hillary Clinton For Pre... | 1 |
# Summary of columns, dtypes, non-null counts and memory usage
news_data.info(memory_usage = True, verbose = True)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 72134 entries, 0 to 72133 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unnamed: 0 72134 non-null int64 1 title 71576 non-null object 2 text 72095 non-null object 3 label 72134 non-null int64 dtypes: int64(2), object(2) memory usage: 2.2+ MB
# Check for fully duplicated rows (output shows 0 duplicates)
news_data.duplicated().sum()
0
# Class distribution of the "label" column before dropping missing values
news_data["label"].value_counts()
1 37106 0 35028 Name: label, dtype: int64
# Null/missing values per column (output: 558 missing titles, 39 missing texts)
news_data.isnull().sum()
Unnamed: 0 0 title 558 text 39 label 0 dtype: int64
# Drop rows with a missing title or text
news_data=news_data.dropna()
# Class distribution of the "label" column after dropping missing values
news_data["label"].value_counts()
1 36509 0 35028 Name: label, dtype: int64
# Dataset dimensions after dropping missing values (rows, columns)
news_data.shape
(71537, 4)
# Create bar plot of label frequencies.
# Compute the class counts once instead of calling value_counts()
# twice (once for x, once for y) as before.
label_counts = news_data["label"].value_counts()
plt.figure(figsize=[10, 6])
sns.set_style("whitegrid")
ax = sns.barplot(x=label_counts.index,
                 y=label_counts,
                 palette="Blues_r",
                 saturation=1)
# Set plot title and axis labels
plt.title("Class frequencies of the dataset (real - 1, fake - 0)", fontsize=16)
plt.xlabel("Label", fontsize=14)
plt.ylabel("Count", fontsize=14)
# Set font sizes for x and y tick labels
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
# Display plot
plt.show()
# Donut-style pie chart showing the share of each label
pie_chart = px.pie(
    data_frame=news_data,
    names="label",
    hole=0.4,
    title="Distribution of Labels (Real - 1, Fake - 0)",
    width=1000,
    height=500,
    color_discrete_sequence=px.colors.sequential.Sunset_r,
    labels={"label": "Label", "": "Count"},
)
# Show the label name and percentage inside each slice,
# outlined with a thin black border
pie_chart.update_traces(
    textposition="inside",
    textinfo="label+percent",
    marker=dict(line=dict(width=1.2, color="#000000")),
)
# Horizontal legend near the top right; centered, enlarged title
pie_chart.update_layout(
    legend=dict(title=None, orientation="h", y=0.8, yanchor="bottom", x=0.8),
    title_x=0.5,
    title_font=dict(size=30),
    uniformtext_minsize=25,
)
# Render the chart
pie_chart.show()
# Get text for "real" news articles
real_text = " ".join(i for i in news_data[news_data.label == 1].text)
# Generate word cloud
wc = WordCloud(background_color="white",
width=1200,
height=600,
contour_width=0,
contour_color="red",
max_words=1000,
scale=1,
collocations=False,
repeat=True,
min_font_size=1)
wc.generate(real_text)
# Plot the word cloud
plt.figure(figsize=[15, 7])
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.title("Word Cloud of Real News Articles", fontsize=20)
plt.tight_layout(pad=0)
plt.show()
# Get text for "fake" news articles
fake_text = " ".join(i for i in news_data[news_data.label == 0].text)
# Generate word cloud
wc = WordCloud(background_color="white",
width=1200,
height=600,
contour_width=0,
contour_color="red",
max_words=1000,
scale=1,
collocations=False,
repeat=True,
min_font_size=1)
wc.generate(fake_text)
# Plot the word cloud
plt.figure(figsize=[15, 7])
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.title("Word Cloud of Fake News Articles", fontsize=20)
plt.tight_layout(pad=0)
plt.show()
In this project, we have various versions of data pre-processing.
| Version | Data Pre-Processing Steps |
|---|---|
| No data pre-processing | |
| Version 1 | Remove URLs/HTML tags, Remove punctuation marks, Remove numbers, Remove single characters, Remove multiple spaces, Remove single quotes, Strip out non-alphanumeric characters, Tokenization |
| Version 2 | All Version 1 steps, then: Tokenization, Remove stopwords, Stemming |
| Version 3 | All Version 2 steps, then: Lowercase, Tokenization, Lemmatization |
# Remove the redundant CSV index column
news_data.drop(['Unnamed: 0'], axis=1, inplace=True)
# Combine the title with the text, separated by a space.
# The original concatenated them with no separator, which fused the
# title's last word with the body's first word (e.g. "bidA dozen"),
# creating bogus vocabulary tokens for the vectorizers downstream.
news_data['text'] = news_data['title'] + ' ' + news_data['text']
news_data.drop(['title'], axis=1, inplace=True)
# Preview the merged column
news_data.head()
| text | label | |
|---|---|---|
| 0 | LAW ENFORCEMENT ON HIGH ALERT Following Threat... | 1 |
| 2 | UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO... | 1 |
| 3 | Bobby Jindal, raised Hindu, uses story of Chri... | 0 |
| 4 | SATAN 2: Russia unvelis an image of its terrif... | 1 |
| 5 | About Time! Christian Group Sues Amazon and SP... | 1 |
# Dataset dimensions after merging title into text (rows, columns)
news_data.shape
(71537, 2)
# First five raw texts (no data pre-processing), styled with a gradient
news_data.head(n = 5).style.background_gradient(cmap = "Blues")
| text | label | |
|---|---|---|
| 0 | LAW ENFORCEMENT ON HIGH ALERT Following Threats Against Cops And Whites On 9-11By #BlackLivesMatter And #FYF911 Terrorists [VIDEO]No comment is expected from Barack Obama Members of the #FYF911 or #FukYoFlag and #BlackLivesMatter movements called for the lynching and hanging of white people and cops. They encouraged others on a radio show Tuesday night to turn the tide and kill white people and cops to send a message about the killing of black people in America.One of the F***YoFlag organizers is called Sunshine. She has a radio blog show hosted from Texas called, Sunshine s F***ing Opinion Radio Show. A snapshot of her #FYF911 @LOLatWhiteFear Twitter page at 9:53 p.m. shows that she was urging supporters to Call now!! #fyf911 tonight we continue to dismantle the illusion of white Below is a SNAPSHOT Twitter Radio Call Invite #FYF911The radio show aired at 10:00 p.m. eastern standard time.During the show, callers clearly call for lynching and killing of white people.A 2:39 minute clip from the radio show can be heard here. It was provided to Breitbart Texas by someone who would like to be referred to as Hannibal. He has already received death threats as a result of interrupting #FYF911 conference calls.An unidentified black man said when those mother f**kers are by themselves, that s when when we should start f***ing them up. Like they do us, when a bunch of them ni**ers takin one of us out, that s how we should roll up. He said, Cause we already roll up in gangs anyway. There should be six or seven black mother f**ckers, see that white person, and then lynch their ass. Let s turn the tables. They conspired that if cops started losing people, then there will be a state of emergency. He speculated that one of two things would happen, a big-ass [R s?????] war, or ni**ers, they are going to start backin up. We are already getting killed out here so what the f**k we got to lose? Sunshine could be heard saying, Yep, that s true. That s so f**king true. 
He said, We need to turn the tables on them. Our kids are getting shot out here. Somebody needs to become a sacrifice on their side.He said, Everybody ain t down for that s**t, or whatever, but like I say, everybody has a different position of war. He continued, Because they don t give a f**k anyway. He said again, We might as well utilized them for that s**t and turn the tables on these n**ers. He said, that way we can start lookin like we ain t havin that many casualties, and there can be more causalities on their side instead of ours. They are out their killing black people, black lives don t matter, that s what those mother f**kers so we got to make it matter to them. Find a mother f**ker that is alone. Snap his ass, and then f***in hang him from a damn tree. Take a picture of it and then send it to the mother f**kers. We just need one example, and then people will start watchin . This will turn the tables on s**t, he said. He said this will start a trickle-down effect. He said that when one white person is hung and then they are just flat-hanging, that will start the trickle-down effect. He continued, Black people are good at starting trends. He said that was how to get the upper-hand. Another black man spoke up saying they needed to kill cops that are killing us. The first black male said, That will be the best method right there. Breitbart Texas previously reported how Sunshine was upset when racist white people infiltrated and disrupted one of her conference calls. She subsequently released the phone number of one of the infiltrators. The veteran immediately started receiving threatening calls.One of the #F***YoFlag movement supporters allegedly told a veteran who infiltrated their publicly posted conference call, We are going to rape and gut your pregnant wife, and your f***ing piece of sh*t unborn creature will be hung from a tree. 
Breitbart Texas previously encountered Sunshine at a Sandra Bland protest at the Waller County Jail in Texas, where she said all white people should be killed. She told journalists and photographers, You see this nappy-ass hair on my head? That means I am one of those more militant Negroes. She said she was at the protest because these redneck mother-f**kers murdered Sandra Bland because she had nappy hair like me. #FYF911 black radicals say they will be holding the imperial powers that are actually responsible for the terrorist attacks on September 11th accountable on that day, as reported by Breitbart Texas. There are several websites and Twitter handles for the movement. Palmetto Star describes himself as one of the head organizers. He said in a YouTube video that supporters will be burning their symbols of the illusion of their superiority, their false white supremacy, like the American flag, the British flag, police uniforms, and Ku Klux Klan hoods.Sierra McGrone or Nocturnus Libertus posted, you too can help a young Afrikan clean their a** with the rag of oppression. She posted two photos, one that appears to be herself, and a photo of a black man, wiping their naked butts with the American flag.For entire story: Breitbart News | 1 |
| 2 | UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MOST CHARLOTTE RIOTERS WERE “PEACEFUL” PROTESTERS…In Her Home State Of North Carolina [VIDEO] Now, most of the demonstrators gathered last night were exercising their constitutional and protected right to peaceful protest in order to raise issues and create change. Loretta Lynch aka Eric Holder in a skirt | 1 |
| 3 | Bobby Jindal, raised Hindu, uses story of Christian conversion to woo evangelicals for potential 2016 bidA dozen politically active pastors came here for a private dinner Friday night to hear a conversion story unique in the context of presidential politics: how Louisiana Gov. Bobby Jindal traveled from Hinduism to Protestant Christianity and, ultimately, became what he calls an “evangelical Catholic.” Over two hours, Jindal, 42, recalled talking with a girl in high school who wanted to “save my soul,” reading the Bible in a closet so his parents would not see him and feeling a stir while watching a movie during his senior year that depicted Jesus on the cross. “I was struck, and struck hard,” Jindal told the pastors. “This was the Son of God, and He had died for our sins.” Jindal’s session with the Christian clergy, who lead congregations in the early presidential battleground states of Iowa and South Carolina, was part of a behind-the-scenes effort by the Louisiana governor to find a political base that could help propel him into the top tier of Republican candidates seeking to run for the White House in 2016. Known in GOP circles mostly for his mastery of policy issues such as health care, Jindal, a Rhodes Scholar and graduate of the Ivy League’s Brown University, does not have an obvious pool of activist supporters to help drive excitement outside his home state. So he is harnessing his religious experience in a way that has begun to appeal to parts of the GOP’s influential core of religious conservatives, many of whom have yet to find a favorite among the Republicans eyeing the presidential race. Other potential 2016 GOP candidates are wooing the evangelical base, including Sens. Rand Paul (Ky.) and Ted Cruz (Tex.) and Indiana Gov. Mike Pence. But over the weekend in Lynchburg — a mecca of sorts for evangelicals as the home of Liberty University, founded in the 1970s by the Rev. Jerry Falwell — Jindal appeared to make progress. 
In addition to his dinner with the pastors, he delivered a well-received “call to action” address to 40,000 Christian conservatives gathered for Liberty’s commencement ceremony, talking again about his faith while assailing what he said was President Obama’s record of attacking religious liberty. The pastors who came to meet Jindal said his intimate descriptions of his experiences stood out. “He has the convictions, and he has what it takes to communicate them,” said Brad Sherman of Solid Rock Christian Church in Coralville, Iowa. Sherman helped former Arkansas governor Mike Huckabee in his winning 2008 campaign for delegates in Iowa. Another Huckabee admirer, the Rev. C. Mitchell Brooks of Second Baptist Church in Belton, S.C., said Jindal’s commitment to Christian values and his compelling story put him “on a par” with Huckabee, who was a Baptist preacher before entering politics. The visiting pastors flew to Lynchburg over the weekend at the invitation of the American Renewal Project, a well-funded nonprofit group that encourages evangelical Christians to engage in the civic arena with voter guides, get-out-the-vote drives and programs to train pastors in grass-roots activism. The group’s founder, David Lane, has built a pastor network in politically important states such as Iowa, Missouri, Ohio and South Carolina and has led trips to Israel with Paul and others seeking to make inroads with evangelical activists. The group that Lane invited to Lynchburg included Donald Wildmon, a retired minister and founder of the American Family Association, a prominent evangelical activist group that has influence through its network of more than 140 Christian radio stations. Most of the pastors that Lane’s organization brought to Lynchburg had not met Jindal. 
But they said he captured their interest recently when he stepped forward to defend Phil Robertson, patriarch of the “Duck Dynasty” television-show family, amid a controversy over disparaging remarks he made about gays in an interview with GQ magazine. Throughout his Lynchburg visit, Jindal presented himself as a willing culture warrior. During his commencement address Saturday, he took up the cause of twin brothers whose HGTV reality series about renovating and reselling houses, “Flip It Forward,” was canceled last week after a Web site revealed that they had protested against same-sex marriage at the 2012 Democratic National Convention in Charlotte. The siblings, Jason and David Benham, both Liberty graduates, attended the graduation and a private lunch with Jindal, who called the action against them “another demonstration of intolerance from the entertainment industry.” “If these guys had protested at the Republican Party convention, instead of canceling their show, HGTV would probably have given them a raise,” Jindal said as the Liberty crowd applauded. He cited the Hobby Lobby craft store chain, which faced a legal challenge after refusing to provide employees with insurance coverage for contraceptives as required under the Affordable Care Act. Members of the family that owns Hobby Lobby, who have become heroes to many religious conservatives, have said that they are morally opposed to the use of certain types of birth control and that they considered the requirement a violation of their First Amendment right to religious freedom. The family was “committed to honor the Lord by being generous employers, paying well above minimum wage and increasing salaries four years in a row even in the midst of the enduring recession,” Jindal told the Liberty graduates. “None of this matters to the Obama administration.” But for the pastors who came to see Jindal in action, the governor’s own story was the highlight of the weekend. 
And in many ways, he was unlike any other aspiring president these activists had met. Piyush Jindal was born in 1971, four months after his parents arrived in Baton Rouge, La., from their native India. He changed his name to Bobby as a young boy, adopting the name of a character on a favorite television show, “The Brady Bunch.” His decision to become a Christian, he told the pastors, did not come in one moment of lightning epiphany. Instead, he said, it happened in phases, growing from small seeds planted over time. Jindal recalled that his closest friend from grade school gave him a Bible with his name emblazoned in gold on the cover as a Christmas present. It struck him initially as an unimpressive gift, Jindal told the pastors. “Who in the world would spend good money for a Bible when everyone knows you can get one free in any hotel?” he recalled thinking at the time. “And the gold lettering meant I couldn’t give it away or return it.” His religious education reached a higher plane during his junior year in high school, he told his dinner audience. He wanted to ask a pretty girl on a date during a hallway conversation, and she started talking about her faith in God and her opposition to abortion. The girl invited him to visit her church. Jindal said he was skeptical and set out to “investigate all these fanciful claims” made by the girl and other friends. He started reading the Bible in his closet at home. “I was unsure how my parents would react,” he said. After the stirring moment when he saw Christ depicted on the cross during the religious movie, the Bible and his very existence suddenly seemed clearer to him, Jindal told the pastors. Jindal did not dwell on his subsequent conversion to Catholicism just a few years later in college, where he said he immersed himself in the traditions of the church. 
He touched on it briefly during the commencement address, noting in passing that “I am best described as an evangelical Catholic.” Mostly, he sought to showcase the ways in which he shares values with other Christian conservatives. “I read the words of Jesus Christ, and I realized that they were true,” Jindal told the graduates Saturday, offering a less detailed accounting of his conversion than he had done the night before with the pastors. “I used to think that I had found God, but I believe it is more accurate to say that He found me.” | 0 |
| 4 | SATAN 2: Russia unvelis an image of its terrifying new ‘SUPERNUKE’ – Western world takes noticeThe RS-28 Sarmat missile, dubbed Satan 2, will replace the SS-18 Flies at 4.3 miles (7km) per sec and with a range of 6,213 miles (10,000km) The weapons are perceived as part of an increasingly aggressive Russia It could deliver a warhead of 40 megatons – 2,000 times as powerful as the atom bombs dropped on Hiroshima and Nagasaki in 1945 By LIBBY PLUMMER and GARETH DAVIE S Russia has unveiled chilling pictures of its largest ever nuclear missile, capable of destroying an area the size of France. The RS-28 Sarmat missile, dubbed Satan 2 by Nato, has a top speed of 4.3 miles (7km) per second and has been designed to outfox anti-missile shield systems. The new Sarmat missile could deliver warheads of 40 megatons – 2,000 times as powerful as the atom bombs dropped on Hiroshima and Nagasaki in 1945. Scroll down for video Russian President Vladimir Putin is reportedly planning to replace the country’s older SS-18 Satan weapons with the new missiles amid a string of recent disagreements with the West. The Kremlin has stepped up the rhetoric against the West and carried a series of manoeuvres that has infuriated politicians in the US and UK. The pictures were revealed online by chief designers from the Makeyev Rocket Design Bureau. A message posted alongside the picture said: ‘In accordance with the Decree of the Russian Government ‘On the State Defense Order for 2010 and the planning period 2012-2013’, the Makeyev Rocket Design Bureau was instructed to start design and development work on the Sarmat. ‘ The RS-28 Sarmat missile is said to contain 16 nuclear warheads and is capable of destroying an area the size of France or Texas, according to Russian news network Zvezda, which is owned by Russia’s ministry of defence. The weapon is also able to evade radar. 
It is expected to have a range of 6,213 miles (10,000 km), which would allow Moscow to attack London and FOR ENTIRE ARTICLE CLICK LINK | 1 |
| 5 | About Time! Christian Group Sues Amazon and SPLC for Designation as Hate GroupAll we can say on this one is it s about time someone sued the Southern Poverty Law Center!On Tuesday, D. James Kennedy Ministries (DJKM) filed a lawsuit against the Southern Poverty Law Center (SPLC), the charity navigation organization GuideStar, and Amazon, for defamation, religious discrimination, and trafficking in falsehood. The SPLC listed DJKM as a hate group, while GuideStar also categorized it in those terms, and Amazon kept the ministry off of its charity donation program, Amazon Smile. We embarked today on a journey to right a terrible wrong, Dr. Frank Wright, president and CEO at DJKM, said in a statement Tuesday. Those who knowingly label Christian ministries as hate groups, solely for subscribing to the historic Christian faith, are either woefully uninformed or willfully deceitful. In the case of the Southern Poverty Law Center, our lawsuit alleges the latter. The SPLC has labeled DJKM an anti-LGBT hate group for its opposition to same-sex marriage and transgenderism. These false and illegal characterizations have a chilling effect on the free exercise of religion and on religious free speech for all people of faith, Wright declared. After having given the SPLC an opportunity to retract, we have undertaken this legal action, seeking a trial by a jury of our peers, to preserve our own rights under the law and to defend the religious free speech rights of all Americans, the DJKM president concluded.The lawsuit laid out charges against the SPLC, GuideStar, and Amazon.Read more: PJM | 1 |
# Give each pre-processing version its own independent copy of the data,
# so cleaning one version never mutates another
news_data_v1, news_data_v2, news_data_v3 = (news_data.copy() for _ in range(3))
def preprocessing_v1(text):
    """
    Clean a raw article string (pre-processing version 1).

    Steps: remove URLs, replace punctuation with spaces, drop digits,
    drop stranded single characters, collapse repeated spaces, strip
    quotes and any remaining non-alphanumeric characters, then re-join
    the surviving tokens with single spaces.

    Parameters
    ----------
    text : str
        Raw article text.

    Returns
    -------
    str
        Cleaned, space-separated text.
    """
    # remove URLs (the original comment said "HTML Tags", but this
    # pattern actually strips http(s)/www links)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # replace punctuation marks with spaces
    text = re.sub(r'[^\w\s]',' ', text)
    # remove numbers
    text = re.sub(r'\d','', text)
    # remove stranded single characters (e.g. the "s" left from "it's")
    text = re.sub(r'\b[a-zA-Z]\b', '', text)
    # collapse runs of spaces
    text = re.sub(r' +', ' ', text)
    # remove single quotes
    text = re.sub(r"'",'', text)
    # replace remaining non-alphanumeric characters (incl. newlines/tabs)
    # with spaces
    text = re.sub(r'[^a-zA-Z0-9]', ' ', text)
    # tokenize and re-join: after the substitutions above the text is
    # purely alphanumeric words separated by whitespace, so str.split()
    # is equivalent to nltk.word_tokenize here and avoids depending on
    # the 'punkt' resource (which this notebook never downloads)
    return ' '.join(text.split())
# Apply version 1 pre-processing to every article in the v1 copy
news_data_v1['text'] = news_data_v1['text'].apply(preprocessing_v1)
# Cleaned text after applying version 1 pre-processing
news_data_v1.head(n = 5).style.background_gradient(cmap = "Blues")
| text | label | |
|---|---|---|
| 0 | LAW ENFORCEMENT ON HIGH ALERT Following Threats Against Cops And Whites On By BlackLivesMatter And FYF Terrorists VIDEO No comment is expected from Barack Obama Members of the FYF or FukYoFlag and BlackLivesMatter movements called for the lynching and hanging of white people and cops They encouraged others on radio show Tuesday night to turn the tide and kill white people and cops to send message about the killing of black people in America One of the YoFlag organizers is called Sunshine She has radio blog show hosted from Texas called Sunshine ing Opinion Radio Show snapshot of her FYF LOLatWhiteFear Twitter page at shows that she was urging supporters to Call now fyf tonight we continue to dismantle the illusion of white Below is SNAPSHOT Twitter Radio Call Invite FYFThe radio show aired at eastern standard time During the show callers clearly call for lynching and killing of white people minute clip from the radio show can be heard here It was provided to Breitbart Texas by someone who would like to be referred to as Hannibal He has already received death threats as result of interrupting FYF conference calls An unidentified black man said when those mother kers are by themselves that when when we should start ing them up Like they do us when bunch of them ni ers takin one of us out that how we should roll up He said Cause we already roll up in gangs anyway There should be six or seven black mother ckers see that white person and then lynch their ass Let turn the tables They conspired that if cops started losing people then there will be state of emergency He speculated that one of two things would happen big ass war or ni ers they are going to start backin up We are already getting killed out here so what the we got to lose Sunshine could be heard saying Yep that true That so king true He said We need to turn the tables on them Our kids are getting shot out here Somebody needs to become sacrifice on their side He said Everybody ain down for that or 
whatever but like say everybody has different position of war He continued Because they don give anyway He said again We might as well utilized them for that and turn the tables on these ers He said that way we can start lookin like we ain havin that many casualties and there can be more causalities on their side instead of ours They are out their killing black people black lives don matter that what those mother kers so we got to make it matter to them Find mother ker that is alone Snap his ass and then in hang him from damn tree Take picture of it and then send it to the mother kers We just need one example and then people will start watchin This will turn the tables on he said He said this will start trickle down effect He said that when one white person is hung and then they are just flat hanging that will start the trickle down effect He continued Black people are good at starting trends He said that was how to get the upper hand Another black man spoke up saying they needed to kill cops that are killing us The first black male said That will be the best method right there Breitbart Texas previously reported how Sunshine was upset when racist white people infiltrated and disrupted one of her conference calls She subsequently released the phone number of one of the infiltrators The veteran immediately started receiving threatening calls One of the YoFlag movement supporters allegedly told veteran who infiltrated their publicly posted conference call We are going to rape and gut your pregnant wife and your ing piece of sh unborn creature will be hung from tree Breitbart Texas previously encountered Sunshine at Sandra Bland protest at the Waller County Jail in Texas where she said all white people should be killed She told journalists and photographers You see this nappy ass hair on my head That means am one of those more militant Negroes She said she was at the protest because these redneck mother kers murdered Sandra Bland because she had nappy hair like me FYF 
black radicals say they will be holding the imperial powers that are actually responsible for the terrorist attacks on September th accountable on that day as reported by Breitbart Texas There are several websites and Twitter handles for the movement Palmetto Star describes himself as one of the head organizers He said in YouTube video that supporters will be burning their symbols of the illusion of their superiority their false white supremacy like the American flag the British flag police uniforms and Ku Klux Klan hoods Sierra McGrone or Nocturnus Libertus posted you too can help young Afrikan clean their with the rag of oppression She posted two photos one that appears to be herself and photo of black man wiping their naked butts with the American flag For entire story Breitbart News | 1 |
| 2 | UNBELIEVABLE OBAMA ATTORNEY GENERAL SAYS MOST CHARLOTTE RIOTERS WERE PEACEFUL PROTESTERS In Her Home State Of North Carolina VIDEO Now most of the demonstrators gathered last night were exercising their constitutional and protected right to peaceful protest in order to raise issues and create change Loretta Lynch aka Eric Holder in skirt | 1 |
| 3 | Bobby Jindal raised Hindu uses story of Christian conversion to woo evangelicals for potential bidA dozen politically active pastors came here for private dinner Friday night to hear conversion story unique in the context of presidential politics how Louisiana Gov Bobby Jindal traveled from Hinduism to Protestant Christianity and ultimately became what he calls an evangelical Catholic Over two hours Jindal recalled talking with girl in high school who wanted to save my soul reading the Bible in closet so his parents would not see him and feeling stir while watching movie during his senior year that depicted Jesus on the cross was struck and struck hard Jindal told the pastors This was the Son of God and He had died for our sins Jindal session with the Christian clergy who lead congregations in the early presidential battleground states of Iowa and South Carolina was part of behind the scenes effort by the Louisiana governor to find political base that could help propel him into the top tier of Republican candidates seeking to run for the White House in Known in GOP circles mostly for his mastery of policy issues such as health care Jindal Rhodes Scholar and graduate of the Ivy League Brown University does not have an obvious pool of activist supporters to help drive excitement outside his home state So he is harnessing his religious experience in way that has begun to appeal to parts of the GOP influential core of religious conservatives many of whom have yet to find favorite among the Republicans eyeing the presidential race Other potential GOP candidates are wooing the evangelical base including Sens Rand Paul Ky and Ted Cruz Tex and Indiana Gov Mike Pence But over the weekend in Lynchburg mecca of sorts for evangelicals as the home of Liberty University founded in the by the Rev Jerry Falwell Jindal appeared to make progress In addition to his dinner with the pastors he delivered well received call to action address to Christian conservatives gathered for 
Liberty commencement ceremony talking again about his faith while assailing what he said was President Obama record of attacking religious liberty The pastors who came to meet Jindal said his intimate descriptions of his experiences stood out He has the convictions and he has what it takes to communicate them said Brad Sherman of Solid Rock Christian Church in Coralville Iowa Sherman helped former Arkansas governor Mike Huckabee in his winning campaign for delegates in Iowa Another Huckabee admirer the Rev Mitchell Brooks of Second Baptist Church in Belton said Jindal commitment to Christian values and his compelling story put him on par with Huckabee who was Baptist preacher before entering politics The visiting pastors flew to Lynchburg over the weekend at the invitation of the American Renewal Project well funded nonprofit group that encourages evangelical Christians to engage in the civic arena with voter guides get out the vote drives and programs to train pastors in grass roots activism The group founder David Lane has built pastor network in politically important states such as Iowa Missouri Ohio and South Carolina and has led trips to Israel with Paul and others seeking to make inroads with evangelical activists The group that Lane invited to Lynchburg included Donald Wild mon retired minister and founder of the American Family Association prominent evangelical activist group that has influence through its network of more than Christian radio stations Most of the pastors that Lane organization brought to Lynchburg had not met Jindal But they said he captured their interest recently when he stepped forward to defend Phil Robertson patriarch of the Duck Dynasty television show family amid controversy over disparaging remarks he made about gays in an interview with GQ magazine Throughout his Lynchburg visit Jindal presented himself as willing culture warrior During his commencement address Saturday he took up the cause of twin brothers whose HGTV reality 
series about renovating and reselling houses Flip It Forward was canceled last week after Web site revealed that they had protested against same sex marriage at the Democratic National Convention in Charlotte The siblings Jason and David Benham both Liberty graduates attended the graduation and private lunch with Jindal who called the action against them another demonstration of intolerance from the entertainment industry If these guys had protested at the Republican Party convention instead of canceling their show HGTV would probably have given them raise Jindal said as the Liberty crowd applauded He cited the Hobby Lobby craft store chain which faced legal challenge after refusing to provide employees with insurance coverage for contraceptives as required under the Affordable Care Act Members of the family that owns Hobby Lobby who have become heroes to many religious conservatives have said that they are morally opposed to the use of certain types of birth control and that they considered the requirement violation of their First Amendment right to religious freedom The family was committed to honor the Lord by being generous employers paying well above minimum wage and increasing salaries four years in row even in the midst of the enduring recession Jindal told the Liberty graduates None of this matters to the Obama administration But for the pastors who came to see Jindal in action the governor own story was the highlight of the weekend And in many ways he was unlike any other aspiring president these activists had met Piyush Jindal was born in four months after his parents arrived in Baton Rouge La from their native India He changed his name to Bobby as young boy adopting the name of character on favorite television show The Brady Bunch His decision to become Christian he told the pastors did not come in one moment of lightning epiphany Instead he said it happened in phases growing from small seeds planted over time Jindal recalled that his closest friend from 
grade school gave him Bible with his name emblazoned in gold on the cover as Christmas present It struck him initially as an unimpressive gift Jindal told the pastors Who in the world would spend good money for Bible when everyone knows you can get one free in any hotel he recalled thinking at the time And the gold lettering meant couldn give it away or return it His religious education reached higher plane during his junior year in high school he told his dinner audience He wanted to ask pretty girl on date during hallway conversation and she started talking about her faith in God and her opposition to abortion The girl invited him to visit her church Jindal said he was skeptical and set out to investigate all these fanciful claims made by the girl and other friends He started reading the Bible in his closet at home was unsure how my parents would react he said After the stirring moment when he saw Christ depicted on the cross during the religious movie the Bible and his very existence suddenly seemed clearer to him Jindal told the pastors Jindal did not dwell on his subsequent conversion to Catholicism just few years later in college where he said he immersed himself in the traditions of the church He touched on it briefly during the commencement address noting in passing that am best described as an evangelical Catholic Mostly he sought to showcase the ways in which he shares values with other Christian conservatives read the words of Jesus Christ and realized that they were true Jindal told the graduates Saturday offering less detailed accounting of his conversion than he had done the night before with the pastors used to think that had found God but believe it is more accurate to say that He found me | 0 |
| 4 | SATAN Russia unvelis an image of its terrifying new SUPERNUKE Western world takes noticeThe RS Sarmat missile dubbed Satan will replace the SS Flies at miles km per sec and with range of miles km The weapons are perceived as part of an increasingly aggressive Russia It could deliver warhead of megatons times as powerful as the atom bombs dropped on Hiroshima and Nagasaki in By LIBBY PLUMMER and GARETH DAVIE Russia has unveiled chilling pictures of its largest ever nuclear missile capable of destroying an area the size of France The RS Sarmat missile dubbed Satan by Nato has top speed of miles km per second and has been designed to outfox anti missile shield systems The new Sarmat missile could deliver warheads of megatons times as powerful as the atom bombs dropped on Hiroshima and Nagasaki in Scroll down for video Russian President Vladimir Putin is reportedly planning to replace the country older SS Satan weapons with the new missiles amid string of recent disagreements with the West The Kremlin has stepped up the rhetoric against the West and carried series of manoeuvres that has infuriated politicians in the US and UK The pictures were revealed online by chief designers from the Makeyev Rocket Design Bureau message posted alongside the picture said In accordance with the Decree of the Russian Government On the State Defense Order for and the planning period the Makeyev Rocket Design Bureau was instructed to start design and development work on the Sarmat The RS Sarmat missile is said to contain nuclear warheads and is capable of destroying an area the size of France or Texas according to Russian news network Zvezda which is owned by Russia ministry of defence The weapon is also able to evade radar It is expected to have range of miles km which would allow Moscow to attack London and FOR ENTIRE ARTICLE CLICK LINK | 1 |
| 5 | About Time Christian Group Sues Amazon and SPLC for Designation as Hate GroupAll we can say on this one is it about time someone sued the Southern Poverty Law Center On Tuesday James Kennedy Ministries DJKM filed lawsuit against the Southern Poverty Law Center SPLC the charity navigation organization GuideStar and Amazon for defamation religious discrimination and trafficking in falsehood The SPLC listed DJKM as hate group while GuideStar also categorized it in those terms and Amazon kept the ministry off of its charity donation program Amazon Smile We embarked today on journey to right terrible wrong Dr Frank Wright president and CEO at DJKM said in statement Tuesday Those who knowingly label Christian ministries as hate groups solely for subscribing to the historic Christian faith are either woefully uninformed or willfully deceitful In the case of the Southern Poverty Law Center our lawsuit alleges the latter The SPLC has labeled DJKM an anti LGBT hate group for its opposition to same sex marriage and transgenderism These false and illegal characterizations have chilling effect on the free exercise of religion and on religious free speech for all people of faith Wright declared After having given the SPLC an opportunity to retract we have undertaken this legal action seeking trial by jury of our peers to preserve our own rights under the law and to defend the religious free speech rights of all Americans the DJKM president concluded The lawsuit laid out charges against the SPLC GuideStar and Amazon Read more PJM | 1 |
def preprocessing_v2(text):
    """Version-2 text preprocessing.

    Runs the version-1 cleaning first, then tokenizes, removes English
    stop words, and applies Porter stemming.

    Parameters
    ----------
    text : str
        Raw article text.

    Returns
    -------
    str
        Space-joined string of stemmed, stop-word-free tokens.
    """
    # Start from the version-1 cleaned text (defined earlier in the file).
    text = preprocessing_v1(text)
    # Tokenization
    tokenized_text = nltk.word_tokenize(text)
    # Remove stop words. stopwords.words() returns a list; build a set once
    # so each membership test is O(1) instead of scanning the whole list
    # per token.
    sw = set(stopwords.words('english'))
    words = [word for word in tokenized_text if word not in sw]
    # Stemming with the Porter algorithm.
    stemPS = PorterStemmer()
    words = [stemPS.stem(word) for word in words]
    # Join directly — no need for an extra pass-through comprehension.
    return ' '.join(words)
# Apply the version-2 pipeline (v1 cleaning + stop-word removal + Porter
# stemming) to every article; this overwrites the 'text' column in place.
news_data_v2['text'] = news_data_v2['text'].apply(preprocessing_v2)
# Preview the first five cleaned rows after version-2 preprocessing.
news_data_v2.head(n = 5).style.background_gradient(cmap = "Blues")
| text | label | |
|---|---|---|
| 0 | law enforc on high alert follow threat against cop and white on by blacklivesmatt and fyf terrorist video no comment expect barack obama member fyf fukyoflag blacklivesmatt movement call lynch hang white peopl cop they encourag other radio show tuesday night turn tide kill white peopl cop send messag kill black peopl america one yoflag organ call sunshin she radio blog show host texa call sunshin ing opinion radio show snapshot fyf lolatwhitefear twitter page show urg support call fyf tonight continu dismantl illus white below snapshot twitter radio call invit fyfth radio show air eastern standard time dure show caller clearli call lynch kill white peopl minut clip radio show heard it provid breitbart texa someon would like refer hannib he alreadi receiv death threat result interrupt fyf confer call an unidentifi black man said mother ker start ing like us bunch ni er takin one us roll he said caus alreadi roll gang anyway there six seven black mother cker see white person lynch ass let turn tabl they conspir cop start lose peopl state emerg he specul one two thing would happen big ass war ni er go start backin we alreadi get kill got lose sunshin could heard say yep true that king true he said we need turn tabl our kid get shot somebodi need becom sacrific side he said everybodi whatev like say everybodi differ posit war he continu becaus give anyway he said we might well util turn tabl er he said way start lookin like havin mani casualti causal side instead they kill black peopl black live matter mother ker got make matter find mother ker alon snap ass hang damn tree take pictur send mother ker we need one exampl peopl start watchin thi turn tabl said he said start trickl effect he said one white person hung flat hang start trickl effect he continu black peopl good start trend he said get upper hand anoth black man spoke say need kill cop kill us the first black male said that best method right breitbart texa previous report sunshin upset racist white peopl 
infiltr disrupt one confer call she subsequ releas phone number one infiltr the veteran immedi start receiv threaten call one yoflag movement support allegedli told veteran infiltr publicli post confer call we go rape gut pregnant wife ing piec sh unborn creatur hung tree breitbart texa previous encount sunshin sandra bland protest waller counti jail texa said white peopl kill she told journalist photograph you see nappi ass hair head that mean one milit negro she said protest redneck mother ker murder sandra bland nappi hair like fyf black radic say hold imperi power actual respons terrorist attack septemb th account day report breitbart texa there sever websit twitter handl movement palmetto star describ one head organ he said youtub video support burn symbol illus superior fals white supremaci like american flag british flag polic uniform ku klux klan hood sierra mcgrone nocturnu libertu post help young afrikan clean rag oppress she post two photo one appear photo black man wipe nake butt american flag for entir stori breitbart new | 1 |
| 2 | unbeliev obama attorney gener say most charlott rioter were peac protest in her home state of north carolina video now demonstr gather last night exercis constitut protect right peac protest order rais issu creat chang loretta lynch aka eric holder skirt | 1 |
| 3 | bobbi jindal rais hindu use stori christian convers woo evangel potenti bida dozen polit activ pastor came privat dinner friday night hear convers stori uniqu context presidenti polit louisiana gov bobbi jindal travel hinduism protest christian ultim becam call evangel cathol over two hour jindal recal talk girl high school want save soul read bibl closet parent would see feel stir watch movi senior year depict jesu cross struck struck hard jindal told pastor thi son god he die sin jindal session christian clergi lead congreg earli presidenti battleground state iowa south carolina part behind scene effort louisiana governor find polit base could help propel top tier republican candid seek run white hous known gop circl mostli masteri polici issu health care jindal rhode scholar graduat ivi leagu brown univers obviou pool activist support help drive excit outsid home state so har religi experi way begun appeal part gop influenti core religi conserv mani yet find favorit among republican eye presidenti race other potenti gop candid woo evangel base includ sen rand paul ky ted cruz tex indiana gov mike penc but weekend lynchburg mecca sort evangel home liberti univers found rev jerri falwel jindal appear make progress in addit dinner pastor deliv well receiv call action address christian conserv gather liberti commenc ceremoni talk faith assail said presid obama record attack religi liberti the pastor came meet jindal said intim descript experi stood he convict take commun said brad sherman solid rock christian church coralvil iowa sherman help former arkansa governor mike huckabe win campaign deleg iowa anoth huckabe admir rev mitchel brook second baptist church belton said jindal commit christian valu compel stori put par huckabe baptist preacher enter polit the visit pastor flew lynchburg weekend invit american renew project well fund nonprofit group encourag evangel christian engag civic arena voter guid get vote drive program train pastor grass root activ 
the group founder david lane built pastor network polit import state iowa missouri ohio south carolina led trip israel paul other seek make inroad evangel activist the group lane invit lynchburg includ donald wild mon retir minist founder american famili associ promin evangel activist group influenc network christian radio station most pastor lane organ brought lynchburg met jindal but said captur interest recent step forward defend phil robertson patriarch duck dynasti televis show famili amid controversi disparag remark made gay interview gq magazin throughout lynchburg visit jindal present will cultur warrior dure commenc address saturday took caus twin brother whose hgtv realiti seri renov resel hous flip it forward cancel last week web site reveal protest sex marriag democrat nation convent charlott the sibl jason david benham liberti graduat attend graduat privat lunch jindal call action anoth demonstr intoler entertain industri if guy protest republican parti convent instead cancel show hgtv would probabl given rais jindal said liberti crowd applaud he cite hobbi lobbi craft store chain face legal challeng refus provid employe insur coverag contracept requir afford care act member famili own hobbi lobbi becom hero mani religi conserv said moral oppos use certain type birth control consid requir violat first amend right religi freedom the famili commit honor lord gener employ pay well minimum wage increas salari four year row even midst endur recess jindal told liberti graduat none matter obama administr but pastor came see jindal action governor stori highlight weekend and mani way unlik aspir presid activist met piyush jindal born four month parent arriv baton roug la nativ india he chang name bobbi young boy adopt name charact favorit televis show the bradi bunch hi decis becom christian told pastor come one moment lightn epiphani instead said happen phase grow small seed plant time jindal recal closest friend grade school gave bibl name emblazon gold 
cover christma present it struck initi unimpress gift jindal told pastor who world would spend good money bibl everyon know get one free hotel recal think time and gold letter meant give away return hi religi educ reach higher plane junior year high school told dinner audienc he want ask pretti girl date hallway convers start talk faith god opposit abort the girl invit visit church jindal said skeptic set investig fanci claim made girl friend he start read bibl closet home unsur parent would react said after stir moment saw christ depict cross religi movi bibl exist suddenli seem clearer jindal told pastor jindal dwell subsequ convers catholic year later colleg said immers tradit church he touch briefli commenc address note pass best describ evangel cathol mostli sought showcas way share valu christian conserv read word jesu christ realiz true jindal told graduat saturday offer less detail account convers done night pastor use think found god believ accur say he found | 0 |
| 4 | satan russia unv imag terrifi new supernuk western world take noticeth rs sarmat missil dub satan replac ss fli mile km per sec rang mile km the weapon perceiv part increasingli aggress russia it could deliv warhead megaton time power atom bomb drop hiroshima nagasaki by libbi plummer gareth davi russia unveil chill pictur largest ever nuclear missil capabl destroy area size franc the rs sarmat missil dub satan nato top speed mile km per second design outfox anti missil shield system the new sarmat missil could deliv warhead megaton time power atom bomb drop hiroshima nagasaki scroll video russian presid vladimir putin reportedli plan replac countri older ss satan weapon new missil amid string recent disagr west the kremlin step rhetor west carri seri manoeuvr infuri politician us uk the pictur reveal onlin chief design makeyev rocket design bureau messag post alongsid pictur said in accord decre russian govern on state defens order plan period makeyev rocket design bureau instruct start design develop work sarmat the rs sarmat missil said contain nuclear warhead capabl destroy area size franc texa accord russian news network zvezda own russia ministri defenc the weapon also abl evad radar it expect rang mile km would allow moscow attack london for entir articl click link | 1 |
| 5 | about time christian group sue amazon splc design hate groupal say one time someon su southern poverti law center on tuesday jame kennedi ministri djkm file lawsuit southern poverti law center splc chariti navig organ guidestar amazon defam religi discrimin traffick falsehood the splc list djkm hate group guidestar also categor term amazon kept ministri chariti donat program amazon smile we embark today journey right terribl wrong dr frank wright presid ceo djkm said statement tuesday those knowingli label christian ministri hate group sole subscrib histor christian faith either woefulli uninform will deceit in case southern poverti law center lawsuit alleg latter the splc label djkm anti lgbt hate group opposit sex marriag transgender these fals illeg character chill effect free exercis religion religi free speech peopl faith wright declar after given splc opportun retract undertaken legal action seek trial juri peer preserv right law defend religi free speech right american djkm presid conclud the lawsuit laid charg splc guidestar amazon read pjm | 1 |
def preprocessing_v3(text):
    """Version-3 text preprocessing.

    Runs the full version-2 pipeline (cleaning, stop-word removal,
    stemming), then lemmatizes every token with WordNet.

    NOTE(review): lemmatizing AFTER Porter stemming has limited effect,
    since stemming already truncates most inflections — confirm this
    ordering is intentional rather than lemmatization-instead-of-stemming.

    Parameters
    ----------
    text : str
        Raw article text.

    Returns
    -------
    str
        Space-joined string of lemmatized tokens.
    """
    # Start from the version-2 output (stemmed, stop-word-free text).
    text = preprocessing_v2(text)
    tokenized_text = nltk.word_tokenize(text)
    # Lemmatize each word in the tokenized text.
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in tokenized_text]
    # Join directly — no need for an extra pass-through comprehension.
    return ' '.join(words)
# Apply the version-3 pipeline (version-2 output + WordNet lemmatization)
# to every article; this overwrites the 'text' column in place.
news_data_v3['text'] = news_data_v3['text'].apply(preprocessing_v3)
# Preview the first five cleaned rows after version-3 preprocessing.
news_data_v3.head(n = 5).style.background_gradient(cmap = "Blues")
| text | label | |
|---|---|---|
| 0 | law enforc on high alert follow threat against cop and white on by blacklivesmatt and fyf terrorist video no comment expect barack obama member fyf fukyoflag blacklivesmatt movement call lynch hang white peopl cop they encourag other radio show tuesday night turn tide kill white peopl cop send messag kill black peopl america one yoflag organ call sunshin she radio blog show host texa call sunshin ing opinion radio show snapshot fyf lolatwhitefear twitter page show urg support call fyf tonight continu dismantl illus white below snapshot twitter radio call invit fyfth radio show air eastern standard time dure show caller clearli call lynch kill white peopl minut clip radio show heard it provid breitbart texa someon would like refer hannib he alreadi receiv death threat result interrupt fyf confer call an unidentifi black man said mother ker start ing like u bunch ni er takin one u roll he said caus alreadi roll gang anyway there six seven black mother cker see white person lynch as let turn tabl they conspir cop start lose peopl state emerg he specul one two thing would happen big as war ni er go start backin we alreadi get kill got lose sunshin could heard say yep true that king true he said we need turn tabl our kid get shot somebodi need becom sacrific side he said everybodi whatev like say everybodi differ posit war he continu becaus give anyway he said we might well util turn tabl er he said way start lookin like havin mani casualti causal side instead they kill black peopl black live matter mother ker got make matter find mother ker alon snap as hang damn tree take pictur send mother ker we need one exampl peopl start watchin thi turn tabl said he said start trickl effect he said one white person hung flat hang start trickl effect he continu black peopl good start trend he said get upper hand anoth black man spoke say need kill cop kill u the first black male said that best method right breitbart texa previous report sunshin upset racist white peopl 
infiltr disrupt one confer call she subsequ releas phone number one infiltr the veteran immedi start receiv threaten call one yoflag movement support allegedli told veteran infiltr publicli post confer call we go rape gut pregnant wife ing piec sh unborn creatur hung tree breitbart texa previous encount sunshin sandra bland protest waller counti jail texa said white peopl kill she told journalist photograph you see nappi as hair head that mean one milit negro she said protest redneck mother ker murder sandra bland nappi hair like fyf black radic say hold imperi power actual respons terrorist attack septemb th account day report breitbart texa there sever websit twitter handl movement palmetto star describ one head organ he said youtub video support burn symbol illus superior fals white supremaci like american flag british flag polic uniform ku klux klan hood sierra mcgrone nocturnu libertu post help young afrikan clean rag oppress she post two photo one appear photo black man wipe nake butt american flag for entir stori breitbart new | 1 |
| 2 | unbeliev obama attorney gener say most charlott rioter were peac protest in her home state of north carolina video now demonstr gather last night exercis constitut protect right peac protest order rais issu creat chang loretta lynch aka eric holder skirt | 1 |
| 3 | bobbi jindal rais hindu use stori christian convers woo evangel potenti bida dozen polit activ pastor came privat dinner friday night hear convers stori uniqu context presidenti polit louisiana gov bobbi jindal travel hinduism protest christian ultim becam call evangel cathol over two hour jindal recal talk girl high school want save soul read bibl closet parent would see feel stir watch movi senior year depict jesu cross struck struck hard jindal told pastor thi son god he die sin jindal session christian clergi lead congreg earli presidenti battleground state iowa south carolina part behind scene effort louisiana governor find polit base could help propel top tier republican candid seek run white hous known gop circl mostli masteri polici issu health care jindal rhode scholar graduat ivi leagu brown univers obviou pool activist support help drive excit outsid home state so har religi experi way begun appeal part gop influenti core religi conserv mani yet find favorit among republican eye presidenti race other potenti gop candid woo evangel base includ sen rand paul ky ted cruz tex indiana gov mike penc but weekend lynchburg mecca sort evangel home liberti univers found rev jerri falwel jindal appear make progress in addit dinner pastor deliv well receiv call action address christian conserv gather liberti commenc ceremoni talk faith assail said presid obama record attack religi liberti the pastor came meet jindal said intim descript experi stood he convict take commun said brad sherman solid rock christian church coralvil iowa sherman help former arkansa governor mike huckabe win campaign deleg iowa anoth huckabe admir rev mitchel brook second baptist church belton said jindal commit christian valu compel stori put par huckabe baptist preacher enter polit the visit pastor flew lynchburg weekend invit american renew project well fund nonprofit group encourag evangel christian engag civic arena voter guid get vote drive program train pastor grass root activ 
the group founder david lane built pastor network polit import state iowa missouri ohio south carolina led trip israel paul other seek make inroad evangel activist the group lane invit lynchburg includ donald wild mon retir minist founder american famili associ promin evangel activist group influenc network christian radio station most pastor lane organ brought lynchburg met jindal but said captur interest recent step forward defend phil robertson patriarch duck dynasti televis show famili amid controversi disparag remark made gay interview gq magazin throughout lynchburg visit jindal present will cultur warrior dure commenc address saturday took caus twin brother whose hgtv realiti seri renov resel hous flip it forward cancel last week web site reveal protest sex marriag democrat nation convent charlott the sibl jason david benham liberti graduat attend graduat privat lunch jindal call action anoth demonstr intoler entertain industri if guy protest republican parti convent instead cancel show hgtv would probabl given rais jindal said liberti crowd applaud he cite hobbi lobbi craft store chain face legal challeng refus provid employe insur coverag contracept requir afford care act member famili own hobbi lobbi becom hero mani religi conserv said moral oppos use certain type birth control consid requir violat first amend right religi freedom the famili commit honor lord gener employ pay well minimum wage increas salari four year row even midst endur recess jindal told liberti graduat none matter obama administr but pastor came see jindal action governor stori highlight weekend and mani way unlik aspir presid activist met piyush jindal born four month parent arriv baton roug la nativ india he chang name bobbi young boy adopt name charact favorit televis show the bradi bunch hi decis becom christian told pastor come one moment lightn epiphani instead said happen phase grow small seed plant time jindal recal closest friend grade school gave bibl name emblazon gold 
cover christma present it struck initi unimpress gift jindal told pastor who world would spend good money bibl everyon know get one free hotel recal think time and gold letter meant give away return hi religi educ reach higher plane junior year high school told dinner audienc he want ask pretti girl date hallway convers start talk faith god opposit abort the girl invit visit church jindal said skeptic set investig fanci claim made girl friend he start read bibl closet home unsur parent would react said after stir moment saw christ depict cross religi movi bibl exist suddenli seem clearer jindal told pastor jindal dwell subsequ convers catholic year later colleg said immers tradit church he touch briefli commenc address note pas best describ evangel cathol mostli sought showcas way share valu christian conserv read word jesu christ realiz true jindal told graduat saturday offer le detail account convers done night pastor use think found god believ accur say he found | 0 |
| 4 | satan russia unv imag terrifi new supernuk western world take noticeth r sarmat missil dub satan replac s fli mile km per sec rang mile km the weapon perceiv part increasingli aggress russia it could deliv warhead megaton time power atom bomb drop hiroshima nagasaki by libbi plummer gareth davi russia unveil chill pictur largest ever nuclear missil capabl destroy area size franc the r sarmat missil dub satan nato top speed mile km per second design outfox anti missil shield system the new sarmat missil could deliv warhead megaton time power atom bomb drop hiroshima nagasaki scroll video russian presid vladimir putin reportedli plan replac countri older s satan weapon new missil amid string recent disagr west the kremlin step rhetor west carri seri manoeuvr infuri politician u uk the pictur reveal onlin chief design makeyev rocket design bureau messag post alongsid pictur said in accord decre russian govern on state defens order plan period makeyev rocket design bureau instruct start design develop work sarmat the r sarmat missil said contain nuclear warhead capabl destroy area size franc texa accord russian news network zvezda own russia ministri defenc the weapon also abl evad radar it expect rang mile km would allow moscow attack london for entir articl click link | 1 |
| 5 | about time christian group sue amazon splc design hate groupal say one time someon su southern poverti law center on tuesday jame kennedi ministri djkm file lawsuit southern poverti law center splc chariti navig organ guidestar amazon defam religi discrimin traffick falsehood the splc list djkm hate group guidestar also categor term amazon kept ministri chariti donat program amazon smile we embark today journey right terribl wrong dr frank wright presid ceo djkm said statement tuesday those knowingli label christian ministri hate group sole subscrib histor christian faith either woefulli uninform will deceit in case southern poverti law center lawsuit alleg latter the splc label djkm anti lgbt hate group opposit sex marriag transgender these fals illeg character chill effect free exercis religion religi free speech peopl faith wright declar after given splc opportun retract undertaken legal action seek trial juri peer preserv right law defend religi free speech right american djkm presid conclud the lawsuit laid charg splc guidestar amazon read pjm | 1 |
# Assemble (features, labels) pairs for each preprocessing variant.
# no pre-processing
X, Y = news_data['text'], news_data['label']
# version 1
x1, y1 = news_data_v1['text'], news_data_v1['label']
# version 2
x2, y2 = news_data_v2['text'], news_data_v2['label']
# version 3
x3, y3 = news_data_v3['text'], news_data_v3['label']
# Hold out 20% of each variant as a test set. The shared random_state
# means every variant is split on the same rows, so scores are directly
# comparable across preprocessing versions.
# no pre-processing
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.20, shuffle=True, random_state=10)
# version 1
x1_train, x1_test, y1_train, y1_test = train_test_split(
    x1, y1, test_size=0.20, shuffle=True, random_state=10)
# version 2
x2_train, x2_test, y2_train, y2_test = train_test_split(
    x2, y2, test_size=0.20, shuffle=True, random_state=10)
# version 3
x3_train, x3_test, y3_train, y3_test = train_test_split(
    x3, y3, test_size=0.20, shuffle=True, random_state=10)
For model training, I selected a range of models and applied each of the following feature-extraction techniques to them:
Feature Extraction 1: CountVectorizer
| Model | No Preprocessing | Version 1 | Version 2 | Version 3 |
|---|---|---|---|---|
| Logistic Regression | - | - | - | - |
| Random Forest Classifier | - | - | - | - |
| Multinomial Naive Bayes | - | - | - | - |
| Passive Aggressive Classifier | - | - | - | - |
| Decision Tree | - | - | - | - |
| Linear SVM | - | - | - | - |
| XGBoost | - | - | - | - |
Feature Extraction 2: TF-IDF
| Model | No Preprocessing | Version 1 | Version 2 | Version 3 |
|---|---|---|---|---|
| Logistic Regression | - | - | - | - |
| Random Forest Classifier | - | - | - | - |
| Multinomial Naive Bayes | - | - | - | - |
| Passive Aggressive Classifier | - | - | - | - |
| Decision Tree | - | - | - | - |
| Linear SVM | - | - | - | - |
| XGBoost | - | - | - | - |
For model evaluation, I used the following metrics to assess each model's performance:
| Model Evaluation Metrics |
|---|
| Confusion Matrix |
| Accuracy |
| Recall |
| Precision |
| F1-Score |
| AUC-ROC |
# Convert text into bag-of-words vectors; one vocabulary per variant.
def _count_vectorize(train_texts, test_texts):
    """Fit a 100-feature CountVectorizer on train_texts and transform both splits."""
    vec = CountVectorizer(max_features=100)
    return vec, vec.fit_transform(train_texts), vec.transform(test_texts)

cv, x_train_cv, x_test_cv = _count_vectorize(x_train, x_test)          # no pre-processing
cv_v1, x1_train_cv, x1_test_cv = _count_vectorize(x1_train, x1_test)   # version 1
cv_v2, x2_train_cv, x2_test_cv = _count_vectorize(x2_train, x2_test)   # version 2
cv_v3, x3_train_cv, x3_test_cv = _count_vectorize(x3_train, x3_test)   # version 3
#Logistic Regression
def _fit_logreg(features, labels):
    """Train a fresh LogisticRegression (lbfgs, 1000 iterations) on one variant."""
    return LogisticRegression(solver='lbfgs', max_iter=1000).fit(features, labels)

lr_model = _fit_logreg(x_train_cv, y_train)        # no pre-processing
lr_model_v1 = _fit_logreg(x1_train_cv, y1_train)   # version 1
lr_model_v2 = _fit_logreg(x2_train_cv, y2_train)   # version 2
lr_model_v3 = _fit_logreg(x3_train_cv, y3_train)   # version 3
LogisticRegression(max_iter=1000)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LogisticRegression(max_iter=1000)
# Score each Logistic Regression variant on its held-out split.
y_pred_lr = lr_model.predict(x_test_cv)
print(f'Accuracy of Logistic Regression model with no pre-processing version by using CountVectorizer: {accuracy_score(y_pred_lr, y_test)*100:.2f}%')
y_pred_lr_v1 = lr_model_v1.predict(x1_test_cv)
print(f'Accuracy of Logistic Regression model with version 1 by using CountVectorizer: {accuracy_score(y_pred_lr_v1, y1_test)*100:.2f}%')
y_pred_lr_v2 = lr_model_v2.predict(x2_test_cv)
print(f'Accuracy of Logistic Regression model with version 2 by using CountVectorizer: {accuracy_score(y_pred_lr_v2, y2_test)*100:.2f}%')
y_pred_lr_v3 = lr_model_v3.predict(x3_test_cv)
print(f'Accuracy of Logistic Regression model with version 3 by using CountVectorizer: {accuracy_score(y_pred_lr_v3, y3_test)*100:.2f}%')
Accuracy of Logistic Regression model with no pre-processing version by using CountVectorizer: 83.62% Accuracy of Logistic Regression model with version 1 by using CountVectorizer: 83.63% Accuracy of Logistic Regression model with version 2 by using CountVectorizer: 88.99% Accuracy of Logistic Regression model with version 3 by using CountVectorizer: 88.85%
# One confusion-matrix figure per Logistic Regression variant.
# NOTE(review): display_labels follow sorted class order [0, 1], so class 0
# is rendered as "True" — confirm this matches the dataset's label encoding.
for truth, pred, title in (
    (y_test, y_pred_lr, 'Confusion Matrix - Logistic Regression(no preprocessing)'),
    (y1_test, y_pred_lr_v1, 'Confusion Matrix - Logistic Regression(version 1)'),
    (y2_test, y_pred_lr_v2, 'Confusion Matrix - Logistic Regression(version 2)'),
    (y3_test, y_pred_lr_v3, 'Confusion Matrix - Logistic Regression(version 3)'),
):
    cm = confusion_matrix(truth, pred)
    cm_display = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[True, False])
    fig, ax = plt.subplots(figsize=(8, 8))
    cm_display.plot(ax=ax)
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')
    ax.set_title(title)
    plt.show()
#no pre-processing version
# Fix: classification_report expects (y_true, y_pred); the original passed the
# predictions first, which swaps the per-class precision and recall columns.
print(classification_report(y_test, y_pred_lr))
precision recall f1-score support
0 0.80 0.85 0.83 6601
1 0.87 0.82 0.84 7707
accuracy 0.84 14308
macro avg 0.84 0.84 0.84 14308
weighted avg 0.84 0.84 0.84 14308
#version 1
# Fix: classification_report expects (y_true, y_pred); swapped order inverts
# per-class precision and recall.
print(classification_report(y1_test, y_pred_lr_v1))
precision recall f1-score support
0 0.80 0.85 0.83 6603
1 0.87 0.82 0.84 7705
accuracy 0.84 14308
macro avg 0.84 0.84 0.84 14308
weighted avg 0.84 0.84 0.84 14308
#version 2
# Fix: classification_report expects (y_true, y_pred); swapped order inverts
# per-class precision and recall.
print(classification_report(y2_test, y_pred_lr_v2))
precision recall f1-score support
0 0.83 0.94 0.88 6206
1 0.95 0.85 0.90 8102
accuracy 0.89 14308
macro avg 0.89 0.90 0.89 14308
weighted avg 0.90 0.89 0.89 14308
#version 3
# Fix: classification_report expects (y_true, y_pred); swapped order inverts
# per-class precision and recall.
print(classification_report(y3_test, y_pred_lr_v3))
precision recall f1-score support
0 0.83 0.94 0.88 6185
1 0.95 0.85 0.90 8123
accuracy 0.89 14308
macro avg 0.89 0.89 0.89 14308
weighted avg 0.90 0.89 0.89 14308
#no pre-processing version
# Calculate the AUC-ROC score.
# NOTE(review): scoring hard 0/1 predictions gives a single-point ROC;
# lr_model.predict_proba would yield a smoother, more informative curve.
auc_roc = roc_auc_score(y_test, y_pred_lr)
print('AUC-ROC score:', auc_roc)
# Plot the ROC curve (fixed 'prerprocessing' typo in the title)
roc_display = RocCurveDisplay.from_predictions(y_test, y_pred_lr)
plt.title('ROC Curve (no preprocessing)')
plt.show()
AUC-ROC score: 0.8354561411853383
#version 1
# Calculate the AUC-ROC score
# NOTE(review): AUC here is computed from hard 0/1 predictions, not
# probabilities; predict_proba would give a smoother ROC curve.
auc_roc = roc_auc_score(y1_test, y_pred_lr_v1)
print('AUC-ROC score:', auc_roc)
# Plot the ROC curve
roc_display = RocCurveDisplay.from_predictions(y1_test, y_pred_lr_v1)
plt.title('ROC Curve (version 1)')
plt.show()
AUC-ROC score: 0.8355992641279459
#version 2
# Calculate the AUC-ROC score
auc_roc = roc_auc_score(y2_test, y_pred_lr_v2)
print('AUC-ROC score:', auc_roc)
# Plot the ROC curve
roc_display = RocCurveDisplay.from_predictions(y2_test, y_pred_lr_v2)
plt.title('ROC Curve (version 2)')
plt.show()
AUC-ROC score: 0.888586804533883
#version 3
# Calculate the AUC-ROC score
auc_roc = roc_auc_score(y3_test, y_pred_lr_v3)
print('AUC-ROC score:', auc_roc)
# Plot the ROC curve
roc_display = RocCurveDisplay.from_predictions(y3_test, y_pred_lr_v3)
plt.title('ROC Curve (version 3)')
plt.show()
AUC-ROC score: 0.8870840136365021
#Random Forest Classifier
def _fit_rf(features, labels):
    """Train a fresh 100-tree entropy RandomForestClassifier on one variant."""
    return RandomForestClassifier(n_estimators=100, criterion='entropy').fit(features, labels)

rf_model = _fit_rf(x_train_cv, y_train)        # no pre-processing
rf_model_v1 = _fit_rf(x1_train_cv, y1_train)   # version 1
rf_model_v2 = _fit_rf(x2_train_cv, y2_train)   # version 2
rf_model_v3 = _fit_rf(x3_train_cv, y3_train)   # version 3
RandomForestClassifier(criterion='entropy')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomForestClassifier(criterion='entropy')
# Score each Random Forest variant on its held-out split.
y_pred_rf = rf_model.predict(x_test_cv)
print(f'Accuracy of Random Forest Classifier with no pre-processing version by using CountVectorizer: {accuracy_score(y_pred_rf, y_test)*100:.2f}%')
y_pred_rf_v1 = rf_model_v1.predict(x1_test_cv)
print(f'Accuracy of Random Forest Classifier with version 1 by using CountVectorizer: {accuracy_score(y_pred_rf_v1, y1_test)*100:.2f}%')
y_pred_rf_v2 = rf_model_v2.predict(x2_test_cv)
print(f'Accuracy of Random Forest Classifier with version 2 by using CountVectorizer: {accuracy_score(y_pred_rf_v2, y2_test)*100:.2f}%')
y_pred_rf_v3 = rf_model_v3.predict(x3_test_cv)
print(f'Accuracy of Random Forest Classifier with version 3 by using CountVectorizer: {accuracy_score(y_pred_rf_v3, y3_test)*100:.2f}%')
Accuracy of Random Forest Classifier with no pre-processing version by using CountVectorizer: 90.37% Accuracy of Random Forest Classifier with version 1 by using CountVectorizer: 90.49% Accuracy of Random Forest Classifier with version 2 by using CountVectorizer: 90.96% Accuracy of Random Forest Classifier with version 3 by using CountVectorizer: 90.78%
# One confusion-matrix figure per Random Forest variant.
# NOTE(review): display_labels follow sorted class order [0, 1], so class 0
# is rendered as "True" — confirm this matches the dataset's label encoding.
for truth, pred, title in (
    (y_test, y_pred_rf, 'Confusion Matrix - Random Forest(no preprocessing)'),
    (y1_test, y_pred_rf_v1, 'Confusion Matrix - Random Forest(version 1)'),
    (y2_test, y_pred_rf_v2, 'Confusion Matrix - Random Forest(version 2)'),
    (y3_test, y_pred_rf_v3, 'Confusion Matrix - Random Forest(version 3)'),
):
    cm = confusion_matrix(truth, pred)
    cm_display = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[True, False])
    fig, ax = plt.subplots(figsize=(8, 8))
    cm_display.plot(ax=ax)
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')
    ax.set_title(title)
    plt.show()
#no pre-processing version
# Fix: classification_report expects (y_true, y_pred); swapped order inverts
# per-class precision and recall.
print(classification_report(y_test, y_pred_rf))
precision recall f1-score support
0 0.87 0.93 0.90 6557
1 0.94 0.88 0.91 7751
accuracy 0.90 14308
macro avg 0.90 0.91 0.90 14308
weighted avg 0.91 0.90 0.90 14308
#version 1
# Fix: classification_report expects (y_true, y_pred); swapped order inverts
# per-class precision and recall.
print(classification_report(y1_test, y_pred_rf_v1))
precision recall f1-score support
0 0.87 0.93 0.90 6570
1 0.94 0.89 0.91 7738
accuracy 0.90 14308
macro avg 0.90 0.91 0.90 14308
weighted avg 0.91 0.90 0.91 14308
#version 2
# Fix: classification_report expects (y_true, y_pred); swapped order inverts
# per-class precision and recall.
print(classification_report(y2_test, y_pred_rf_v2))
precision recall f1-score support
0 0.84 0.97 0.90 6105
1 0.97 0.87 0.92 8203
accuracy 0.91 14308
macro avg 0.91 0.92 0.91 14308
weighted avg 0.92 0.91 0.91 14308
#version 3
# Fix: classification_report expects (y_true, y_pred); swapped order inverts
# per-class precision and recall.
print(classification_report(y3_test, y_pred_rf_v3))
precision recall f1-score support
0 0.84 0.96 0.90 6118
1 0.97 0.87 0.91 8190
accuracy 0.91 14308
macro avg 0.91 0.91 0.91 14308
weighted avg 0.92 0.91 0.91 14308
#no pre-processing version
# Calculate the AUC-ROC score.
# NOTE(review): scoring hard 0/1 predictions gives a single-point ROC;
# rf_model.predict_proba would yield a smoother, more informative curve.
auc_roc = roc_auc_score(y_test, y_pred_rf)
print('AUC-ROC score:', auc_roc)
# Plot the ROC curve (fixed 'prerprocessing' typo in the title)
roc_display = RocCurveDisplay.from_predictions(y_test, y_pred_rf)
plt.title('ROC Curve (no preprocessing)')
plt.show()
AUC-ROC score: 0.9029358032509767
#version 1
# Calculate the AUC-ROC score
# NOTE(review): AUC here is computed from hard 0/1 predictions, not
# probabilities; predict_proba would give a smoother ROC curve.
auc_roc = roc_auc_score(y1_test, y_pred_rf_v1)
print('AUC-ROC score:', auc_roc)
# Plot the ROC curve
roc_display = RocCurveDisplay.from_predictions(y1_test, y_pred_rf_v1)
plt.title('ROC Curve (version 1)')
plt.show()
AUC-ROC score: 0.9041458186820971
#version 2
# Calculate the AUC-ROC score
auc_roc = roc_auc_score(y2_test, y_pred_rf_v2)
print('AUC-ROC score:', auc_roc)
# Plot the ROC curve
roc_display = RocCurveDisplay.from_predictions(y2_test, y_pred_rf_v2)
plt.title('ROC Curve (version 2)')
plt.show()
AUC-ROC score: 0.9080720029804604
#version 3
# Calculate the AUC-ROC score
auc_roc = roc_auc_score(y3_test, y_pred_rf_v3)
print('AUC-ROC score:', auc_roc)
# Plot the ROC curve
roc_display = RocCurveDisplay.from_predictions(y3_test, y_pred_rf_v3)
plt.title('ROC Curve (version 3)')
plt.show()
AUC-ROC score: 0.9063449972177925
#Multinomial Naive Bayes
def _fit_mnb(features, labels):
    """Train a fresh MultinomialNB on one preprocessing variant."""
    return MultinomialNB().fit(features, labels)

mnb_model = _fit_mnb(x_train_cv, y_train)        # no pre-processing
mnb_model_v1 = _fit_mnb(x1_train_cv, y1_train)   # version 1
mnb_model_v2 = _fit_mnb(x2_train_cv, y2_train)   # version 2
mnb_model_v3 = _fit_mnb(x3_train_cv, y3_train)   # version 3
MultinomialNB()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
MultinomialNB()
# Score each Multinomial Naive Bayes variant on its held-out split.
y_pred_mnb = mnb_model.predict(x_test_cv)
print(f'Accuracy of Multinomial Naive Bayes with no pre-processing version by using CountVectorizer: {accuracy_score(y_pred_mnb, y_test)*100:.2f}%')
y_pred_mnb_v1 = mnb_model_v1.predict(x1_test_cv)
print(f'Accuracy of Multinomial Naive Bayes with version 1 by using CountVectorizer: {accuracy_score(y_pred_mnb_v1, y1_test)*100:.2f}%')
y_pred_mnb_v2 = mnb_model_v2.predict(x2_test_cv)
print(f'Accuracy of Multinomial Naive Bayes with version 2 by using CountVectorizer: {accuracy_score(y_pred_mnb_v2, y2_test)*100:.2f}%')
y_pred_mnb_v3 = mnb_model_v3.predict(x3_test_cv)
print(f'Accuracy of Multinomial Naive Bayes with version 3 by using CountVectorizer: {accuracy_score(y_pred_mnb_v3, y3_test)*100:.2f}%')
Accuracy of Multinomial Naive Bayes with no pre-processing version by using CountVectorizer: 81.77% Accuracy of Multinomial Naive Bayes with version 1 by using CountVectorizer: 81.79% Accuracy of Multinomial Naive Bayes with version 2 by using CountVectorizer: 84.34% Accuracy of Multinomial Naive Bayes with version 3 by using CountVectorizer: 83.90%
# One confusion-matrix figure per Multinomial Naive Bayes variant.
# NOTE(review): display_labels follow sorted class order [0, 1], so class 0
# is rendered as "True" — confirm this matches the dataset's label encoding.
for truth, pred, title in (
    (y_test, y_pred_mnb, 'Confusion Matrix - Multinomial Naive Bayes(no preprocessing)'),
    (y1_test, y_pred_mnb_v1, 'Confusion Matrix - Multinomial Naive Bayes(version 1)'),
    (y2_test, y_pred_mnb_v2, 'Confusion Matrix - Multinomial Naive Bayes(version 2)'),
    (y3_test, y_pred_mnb_v3, 'Confusion Matrix - Multinomial Naive Bayes(version 3)'),
):
    cm = confusion_matrix(truth, pred)
    cm_display = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[True, False])
    fig, ax = plt.subplots(figsize=(8, 8))
    cm_display.plot(ax=ax)
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')
    ax.set_title(title)
    plt.show()
#no pre-processing version
# Fix: classification_report expects (y_true, y_pred); swapped order inverts
# per-class precision and recall.
print(classification_report(y_test, y_pred_mnb))
precision recall f1-score support
0 0.85 0.79 0.82 7518
1 0.79 0.85 0.82 6790
accuracy 0.82 14308
macro avg 0.82 0.82 0.82 14308
weighted avg 0.82 0.82 0.82 14308
#version 1
# Fix: classification_report expects (y_true, y_pred); swapped order inverts
# per-class precision and recall.
print(classification_report(y1_test, y_pred_mnb_v1))
precision recall f1-score support
0 0.85 0.79 0.82 7527
1 0.79 0.85 0.82 6781
accuracy 0.82 14308
macro avg 0.82 0.82 0.82 14308
weighted avg 0.82 0.82 0.82 14308
#version 2
# Fix: classification_report expects (y_true, y_pred); swapped order inverts
# per-class precision and recall.
print(classification_report(y2_test, y_pred_mnb_v2))
precision recall f1-score support
0 0.86 0.83 0.84 7230
1 0.83 0.86 0.84 7078
accuracy 0.84 14308
macro avg 0.84 0.84 0.84 14308
weighted avg 0.84 0.84 0.84 14308
#version 3
# Fix: classification_report expects (y_true, y_pred); swapped order inverts
# per-class precision and recall.
print(classification_report(y3_test, y_pred_mnb_v3))
precision recall f1-score support
0 0.85 0.82 0.84 7211
1 0.83 0.85 0.84 7097
accuracy 0.84 14308
macro avg 0.84 0.84 0.84 14308
weighted avg 0.84 0.84 0.84 14308
#no pre-processing version
# Calculate the AUC-ROC score.
# NOTE(review): scoring hard 0/1 predictions gives a single-point ROC;
# mnb_model.predict_proba would yield a smoother, more informative curve.
auc_roc = roc_auc_score(y_test, y_pred_mnb)
print('AUC-ROC score:', auc_roc)
# Plot the ROC curve (fixed 'prerprocessing' typo in the title)
roc_display = RocCurveDisplay.from_predictions(y_test, y_pred_mnb)
plt.title('ROC Curve (no preprocessing)')
plt.show()
AUC-ROC score: 0.8184218424886368
#version 1
# Calculate the AUC-ROC score
# NOTE(review): AUC here is computed from hard 0/1 predictions, not
# probabilities; predict_proba would give a smoother ROC curve.
auc_roc = roc_auc_score(y1_test, y_pred_mnb_v1)
print('AUC-ROC score:', auc_roc)
# Plot the ROC curve
roc_display = RocCurveDisplay.from_predictions(y1_test, y_pred_mnb_v1)
plt.title('ROC Curve (version 1)')
plt.show()
AUC-ROC score: 0.818646321274116
#version 2
# Calculate the AUC-ROC score
auc_roc = roc_auc_score(y2_test, y_pred_mnb_v2)
print('AUC-ROC score:', auc_roc)
# Plot the ROC curve
roc_display = RocCurveDisplay.from_predictions(y2_test, y_pred_mnb_v2)
plt.title('ROC Curve (version 2)')
plt.show()
AUC-ROC score: 0.8436856126370619
#version 3
# Calculate the AUC-ROC score
auc_roc = roc_auc_score(y3_test, y_pred_mnb_v3)
print('AUC-ROC score:', auc_roc)
# Plot the ROC curve
roc_display = RocCurveDisplay.from_predictions(y3_test, y_pred_mnb_v3)
plt.title('ROC Curve (version 3)')
plt.show()
AUC-ROC score: 0.8392490653364151
#Passive Aggressive Classifier
def _fit_pac(features, labels):
    """Train a fresh PassiveAggressiveClassifier (200 iterations) on one variant."""
    return PassiveAggressiveClassifier(max_iter=200).fit(features, labels)

pac_model = _fit_pac(x_train_cv, y_train)        # no pre-processing
pac_model_v1 = _fit_pac(x1_train_cv, y1_train)   # version 1
pac_model_v2 = _fit_pac(x2_train_cv, y2_train)   # version 2
pac_model_v3 = _fit_pac(x3_train_cv, y3_train)   # version 3
PassiveAggressiveClassifier(max_iter=200)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PassiveAggressiveClassifier(max_iter=200)
# Score each Passive Aggressive Classifier variant on its held-out split.
y_pred_pac = pac_model.predict(x_test_cv)
print(f'Accuracy of Passive Aggressive Classifier with no pre-processing version by using CountVectorizer: {accuracy_score(y_pred_pac, y_test)*100:.2f}%')
y_pred_pac_v1 = pac_model_v1.predict(x1_test_cv)
print(f'Accuracy of Passive Aggressive Classifier with version 1 by using CountVectorizer: {accuracy_score(y_pred_pac_v1, y1_test)*100:.2f}%')
y_pred_pac_v2 = pac_model_v2.predict(x2_test_cv)
print(f'Accuracy of Passive Aggressive Classifier with version 2 by using CountVectorizer: {accuracy_score(y_pred_pac_v2, y2_test)*100:.2f}%')
y_pred_pac_v3 = pac_model_v3.predict(x3_test_cv)
print(f'Accuracy of Passive Aggressive Classifier with version 3 by using CountVectorizer: {accuracy_score(y_pred_pac_v3, y3_test)*100:.2f}%')
Accuracy of Passive Aggressive Classifier with no pre-processing version by using CountVectorizer: 79.51% Accuracy of Passive Aggressive Classifier with version 1 by using CountVectorizer: 76.94% Accuracy of Passive Aggressive Classifier with version 2 by using CountVectorizer: 81.65% Accuracy of Passive Aggressive Classifier with version 3 by using CountVectorizer: 80.25%
# One confusion-matrix figure per Passive Aggressive Classifier variant.
# NOTE(review): display_labels follow sorted class order [0, 1], so class 0
# is rendered as "True" — confirm this matches the dataset's label encoding.
for truth, pred, title in (
    (y_test, y_pred_pac, 'Confusion Matrix - Passive Aggressive Classifier(no preprocessing)'),
    (y1_test, y_pred_pac_v1, 'Confusion Matrix - Passive Aggressive Classifier(version 1)'),
    (y2_test, y_pred_pac_v2, 'Confusion Matrix - Passive Aggressive Classifier(version 2)'),
    (y3_test, y_pred_pac_v3, 'Confusion Matrix - Passive Aggressive Classifier(version 3)'),
):
    cm = confusion_matrix(truth, pred)
    cm_display = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[True, False])
    fig, ax = plt.subplots(figsize=(8, 8))
    cm_display.plot(ax=ax)
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')
    ax.set_title(title)
    plt.show()
#no pre-processing version
# Fix: classification_report expects (y_true, y_pred); swapped order inverts
# per-class precision and recall.
print(classification_report(y_test, y_pred_pac))
precision recall f1-score support
0 0.84 0.76 0.80 7736
1 0.75 0.83 0.79 6572
accuracy 0.80 14308
macro avg 0.80 0.80 0.79 14308
weighted avg 0.80 0.80 0.80 14308
#version 1
# Fix: classification_report expects (y_true, y_pred); swapped order inverts
# per-class precision and recall.
print(classification_report(y1_test, y_pred_pac_v1))
precision recall f1-score support
0 0.85 0.72 0.78 8217
1 0.69 0.83 0.75 6091
accuracy 0.77 14308
macro avg 0.77 0.78 0.77 14308
weighted avg 0.78 0.77 0.77 14308
#version 2
# Fix: classification_report expects (y_true, y_pred); swapped order inverts
# per-class precision and recall.
print(classification_report(y2_test, y_pred_pac_v2))
precision recall f1-score support
0 0.77 0.84 0.80 6388
1 0.86 0.80 0.83 7920
accuracy 0.82 14308
macro avg 0.82 0.82 0.82 14308
weighted avg 0.82 0.82 0.82 14308
#version 3
# Fix: classification_report expects (y_true, y_pred); swapped order inverts
# per-class precision and recall.
print(classification_report(y3_test, y_pred_pac_v3))
precision recall f1-score support
0 0.78 0.81 0.79 6741
1 0.82 0.80 0.81 7567
accuracy 0.80 14308
macro avg 0.80 0.80 0.80 14308
weighted avg 0.80 0.80 0.80 14308
#no pre-processing version
# Calculate the AUC-ROC score.
# NOTE(review): scoring hard 0/1 predictions gives a single-point ROC;
# pac_model.decision_function would yield a smoother, more informative curve.
auc_roc = roc_auc_score(y_test, y_pred_pac)
print('AUC-ROC score:', auc_roc)
# Plot the ROC curve (fixed 'prerprocessing' typo in the title)
roc_display = RocCurveDisplay.from_predictions(y_test, y_pred_pac)
plt.title('ROC Curve (no preprocessing)')
plt.show()
AUC-ROC score: 0.7962605421698818
#version 1
# Calculate the AUC-ROC score
# NOTE(review): AUC here is computed from hard 0/1 predictions, not decision
# scores; decision_function would give a smoother ROC curve.
auc_roc = roc_auc_score(y1_test, y_pred_pac_v1)
print('AUC-ROC score:', auc_roc)
# Plot the ROC curve
roc_display = RocCurveDisplay.from_predictions(y1_test, y_pred_pac_v1)
plt.title('ROC Curve (version 1)')
plt.show()
AUC-ROC score: 0.771241895230839
#version 2
# Calculate the AUC-ROC score
auc_roc = roc_auc_score(y2_test, y_pred_pac_v2)
print('AUC-ROC score:', auc_roc)
# Plot the ROC curve
roc_display = RocCurveDisplay.from_predictions(y2_test, y_pred_pac_v2)
plt.title('ROC Curve (version 2)')
plt.show()
AUC-ROC score: 0.8154583706267229
#version 3
# Calculate the AUC-ROC score
auc_roc = roc_auc_score(y3_test, y_pred_pac_v3)
print('AUC-ROC score:', auc_roc)
# Plot the ROC curve
roc_display = RocCurveDisplay.from_predictions(y3_test, y_pred_pac_v3)
plt.title('ROC Curve (version 3)')
plt.show()
AUC-ROC score: 0.8019788618693914
#Decision Tree Classifier
def _fit_dt(features, labels):
    """Train a fresh DecisionTreeClassifier (default params) on one variant."""
    return DecisionTreeClassifier().fit(features, labels)

dt_model = _fit_dt(x_train_cv, y_train)        # no pre-processing
dt_model_v1 = _fit_dt(x1_train_cv, y1_train)   # version 1
dt_model_v2 = _fit_dt(x2_train_cv, y2_train)   # version 2
dt_model_v3 = _fit_dt(x3_train_cv, y3_train)   # version 3
DecisionTreeClassifier()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
DecisionTreeClassifier()
# Score each Decision Tree variant on its held-out split.
#Predict dt model with no pre-processing version
y_pred_dt = dt_model.predict(x_test_cv)
print('Accuracy of Decision Tree Classifier with no pre-processing version by using CountVectorizer: {:.2f}%'.format(accuracy_score(y_pred_dt,y_test)*100))
#Predict dt model with version 1
y_pred_dt_v1 = dt_model_v1.predict(x1_test_cv)
print('Accuracy of Decision Tree Classifier with version 1 by using CountVectorizer: {:.2f}%'.format(accuracy_score(y_pred_dt_v1,y1_test)*100))
#Predict dt model with version 2
y_pred_dt_v2 = dt_model_v2.predict(x2_test_cv)
print('Accuracy of Decision Tree Classifier with version 2 by using CountVectorizer: {:.2f}%'.format(accuracy_score(y_pred_dt_v2,y2_test)*100))
#Predict dt model with version 3
y_pred_dt_v3 = dt_model_v3.predict(x3_test_cv)
# Fixed message: 'Decision TreeClassifier' was missing the space between words.
print('Accuracy of Decision Tree Classifier with version 3 by using CountVectorizer: {:.2f}%'.format(accuracy_score(y_pred_dt_v3,y3_test)*100))
Accuracy of Decision Tree Classifier with no pre-processing version by using CountVectorizer: 83.66% Accuracy of Decision Tree Classifier with version 1 by using CountVectorizer: 83.65% Accuracy of Decision Tree Classifier with version 2 by using CountVectorizer: 86.75% Accuracy of Decision TreeClassifier with version 3 by using CountVectorizer: 86.48%
# One confusion-matrix figure per Decision Tree variant.
# NOTE(review): display_labels follow sorted class order [0, 1], so class 0
# is rendered as "True" — confirm this matches the dataset's label encoding.
for truth, pred, title in (
    (y_test, y_pred_dt, 'Confusion Matrix - Decision Tree(no preprocessing)'),
    (y1_test, y_pred_dt_v1, 'Confusion Matrix - Decision Tree(version 1)'),
    (y2_test, y_pred_dt_v2, 'Confusion Matrix - Decision Tree(version 2)'),
    (y3_test, y_pred_dt_v3, 'Confusion Matrix - Decision Tree(version 3)'),
):
    cm = confusion_matrix(truth, pred)
    cm_display = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[True, False])
    fig, ax = plt.subplots(figsize=(8, 8))
    cm_display.plot(ax=ax)
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')
    ax.set_title(title)
    plt.show()
#no pre-processing version
# Fix: classification_report expects (y_true, y_pred); swapped order inverts
# per-class precision and recall.
print(classification_report(y_test, y_pred_dt))
precision recall f1-score support
0 0.81 0.85 0.83 6685
1 0.86 0.83 0.84 7623
accuracy 0.84 14308
macro avg 0.84 0.84 0.84 14308
weighted avg 0.84 0.84 0.84 14308
#version 1
# Fix: classification_report expects (y_true, y_pred); swapped order inverts
# per-class precision and recall.
print(classification_report(y1_test, y_pred_dt_v1))
precision recall f1-score support
0 0.81 0.85 0.83 6667
1 0.86 0.83 0.84 7641
accuracy 0.84 14308
macro avg 0.84 0.84 0.84 14308
weighted avg 0.84 0.84 0.84 14308
#version 2
# Fix: classification_report expects (y_true, y_pred); swapped order inverts
# per-class precision and recall.
print(classification_report(y2_test, y_pred_dt_v2))
precision recall f1-score support
0 0.85 0.88 0.86 6761
1 0.89 0.86 0.87 7547
accuracy 0.87 14308
macro avg 0.87 0.87 0.87 14308
weighted avg 0.87 0.87 0.87 14308
#version 3
# Fix: classification_report expects (y_true, y_pred); swapped order inverts
# per-class precision and recall.
print(classification_report(y3_test, y_pred_dt_v3))
precision recall f1-score support
0 0.85 0.87 0.86 6807
1 0.88 0.86 0.87 7501
accuracy 0.86 14308
macro avg 0.86 0.87 0.86 14308
weighted avg 0.87 0.86 0.86 14308
#no pre-processing version
# Calculate the AUC-ROC score.
# NOTE(review): scoring hard 0/1 predictions gives a single-point ROC;
# dt_model.predict_proba would yield a smoother, more informative curve.
auc_roc = roc_auc_score(y_test, y_pred_dt)
print('AUC-ROC score:', auc_roc)
# Plot the ROC curve (fixed 'prerprocessing' typo in the title)
roc_display = RocCurveDisplay.from_predictions(y_test, y_pred_dt)
plt.title('ROC Curve (no preprocessing)')
plt.show()
AUC-ROC score: 0.8360128368435402
#version 1
# Calculate the AUC-ROC score
# NOTE(review): AUC here is computed from hard 0/1 predictions, not
# probabilities; predict_proba would give a smoother ROC curve.
auc_roc = roc_auc_score(y1_test, y_pred_dt_v1)
print('AUC-ROC score:', auc_roc)
# Plot the ROC curve
roc_display = RocCurveDisplay.from_predictions(y1_test, y_pred_dt_v1)
plt.title('ROC Curve (version 1)')
plt.show()
AUC-ROC score: 0.8358435955767524
#version 2
# Calculate the AUC-ROC score
auc_roc = roc_auc_score(y2_test, y_pred_dt_v2)
print('AUC-ROC score:', auc_roc)
# Plot the ROC curve
roc_display = RocCurveDisplay.from_predictions(y2_test, y_pred_dt_v2)
plt.title('ROC Curve (version 2)')
plt.show()
AUC-ROC score: 0.8670455504942179
#version 3
# Calculate the AUC-ROC score
auc_roc = roc_auc_score(y3_test, y_pred_dt_v3)
print('AUC-ROC score:', auc_roc)
# Plot the ROC curve
roc_display = RocCurveDisplay.from_predictions(y3_test, y_pred_dt_v3)
plt.title('ROC Curve (version 3)')
plt.show()
AUC-ROC score: 0.8644633357866182
#Linear SVM
def _fit_svm(features, labels):
    """Train a fresh LinearSVC (10000 iterations) on one preprocessing variant."""
    return LinearSVC(max_iter=10000).fit(features, labels)

svm_model = _fit_svm(x_train_cv, y_train)        # no pre-processing
svm_model_v1 = _fit_svm(x1_train_cv, y1_train)   # version 1
svm_model_v2 = _fit_svm(x2_train_cv, y2_train)   # version 2
svm_model_v3 = _fit_svm(x3_train_cv, y3_train)   # version 3
LinearSVC(max_iter=10000)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LinearSVC(max_iter=10000)
# Score each Linear SVM variant on its held-out split.
y_pred_svm = svm_model.predict(x_test_cv)
print(f'Accuracy of Linear SVM with no pre-processing version by using CountVectorizer: {accuracy_score(y_pred_svm, y_test)*100:.2f}%')
y_pred_svm_v1 = svm_model_v1.predict(x1_test_cv)
print(f'Accuracy of Linear SVM with version 1 by using CountVectorizer: {accuracy_score(y_pred_svm_v1, y1_test)*100:.2f}%')
y_pred_svm_v2 = svm_model_v2.predict(x2_test_cv)
print(f'Accuracy of Linear SVM with version 2 by using CountVectorizer: {accuracy_score(y_pred_svm_v2, y2_test)*100:.2f}%')
y_pred_svm_v3 = svm_model_v3.predict(x3_test_cv)
print(f'Accuracy of Linear SVM with version 3 by using CountVectorizer: {accuracy_score(y_pred_svm_v3, y3_test)*100:.2f}%')
Accuracy of Linear SVM with no pre-processing version by using CountVectorizer: 83.97% Accuracy of Linear SVM with version 1 by using CountVectorizer: 83.97% Accuracy of Linear SVM with version 2 by using CountVectorizer: 88.88% Accuracy of Linear SVM with version 3 by using CountVectorizer: 88.73%
# One confusion-matrix figure per Linear SVM variant.
# NOTE(review): display_labels follow sorted class order [0, 1], so class 0
# is rendered as "True" — confirm this matches the dataset's label encoding.
for truth, pred, title in (
    (y_test, y_pred_svm, 'Confusion Matrix - Linear SVM(no preprocessing)'),
    (y1_test, y_pred_svm_v1, 'Confusion Matrix - Linear SVM(version 1)'),
    (y2_test, y_pred_svm_v2, 'Confusion Matrix - Linear SVM(version 2)'),
    (y3_test, y_pred_svm_v3, 'Confusion Matrix - Linear SVM(version 3)'),
):
    cm = confusion_matrix(truth, pred)
    cm_display = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[True, False])
    fig, ax = plt.subplots(figsize=(8, 8))
    cm_display.plot(ax=ax)
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')
    ax.set_title(title)
    plt.show()
#no pre-processing version
print(classification_report(y_pred_svm,y_test))
precision recall f1-score support
0 0.78 0.88 0.83 6222
1 0.90 0.81 0.85 8086
accuracy 0.84 14308
macro avg 0.84 0.84 0.84 14308
weighted avg 0.85 0.84 0.84 14308
#version 1
# classification_report expects (y_true, y_pred); the original swapped them,
# transposing the precision/recall columns.
print(classification_report(y1_test,y_pred_svm_v1))
precision recall f1-score support
0 0.80 0.86 0.83 6473
1 0.88 0.82 0.85 7835
accuracy 0.84 14308
macro avg 0.84 0.84 0.84 14308
weighted avg 0.84 0.84 0.84 14308
#version 2
# classification_report expects (y_true, y_pred); the original swapped them,
# transposing the precision/recall columns.
print(classification_report(y2_test,y_pred_svm_v2))
precision recall f1-score support
0 0.83 0.94 0.88 6134
1 0.95 0.85 0.90 8174
accuracy 0.89 14308
macro avg 0.89 0.90 0.89 14308
weighted avg 0.90 0.89 0.89 14308
#version 3
# classification_report expects (y_true, y_pred); the original swapped them,
# transposing the precision/recall columns.
print(classification_report(y3_test,y_pred_svm_v3))
precision recall f1-score support
0 0.82 0.94 0.88 6114
1 0.95 0.85 0.90 8194
accuracy 0.89 14308
macro avg 0.89 0.89 0.89 14308
weighted avg 0.90 0.89 0.89 14308
#no pre-processing version
# Calculate the AUC-ROC score
# NOTE(review): AUC is computed from hard 0/1 predictions, not decision
# scores; svm_model.decision_function would give a smoother ROC — confirm intent.
auc_roc = roc_auc_score(y_test, y_pred_svm)
print('AUC-ROC score:', auc_roc)
# Plot the ROC curve
roc_display = RocCurveDisplay.from_predictions(y_test, y_pred_svm)
plt.title('ROC Curve (no preprocessing)')  # fixed typo: "prerprocessing"
plt.show()
AUC-ROC score: 0.8384038462594894
#version 1
# AUC-ROC for the version-1 SVM (computed from its hard label predictions)
v1_auc = roc_auc_score(y1_test, y_pred_svm_v1)
print('AUC-ROC score:', v1_auc)
# Draw the corresponding ROC curve
roc_display = RocCurveDisplay.from_predictions(y1_test, y_pred_svm_v1)
plt.title('ROC Curve (version 1)')
plt.show()
AUC-ROC score: 0.8387436483940252
#version 2
# AUC-ROC for the version-2 SVM (computed from its hard label predictions)
v2_auc = roc_auc_score(y2_test, y_pred_svm_v2)
print('AUC-ROC score:', v2_auc)
# Draw the corresponding ROC curve
roc_display = RocCurveDisplay.from_predictions(y2_test, y_pred_svm_v2)
plt.title('ROC Curve (version 2)')
plt.show()
AUC-ROC score: 0.8873504068583904
#version 3
# AUC-ROC for the version-3 SVM (computed from its hard label predictions)
v3_auc = roc_auc_score(y3_test, y_pred_svm_v3)
print('AUC-ROC score:', v3_auc)
# Draw the corresponding ROC curve
roc_display = RocCurveDisplay.from_predictions(y3_test, y_pred_svm_v3)
plt.title('ROC Curve (version 3)')
plt.show()
AUC-ROC score: 0.8857793192802282
#XGBoost
# Identical hyper-parameters for every preprocessing variant.
xgb_params = dict(n_estimators=200, max_depth=6, learning_rate=0.1)
#no pre-processing
xgb_model = XGBClassifier(**xgb_params)
xgb_model.fit(x_train_cv, y_train)
#version 1
xgb_model_v1 = XGBClassifier(**xgb_params)
xgb_model_v1.fit(x1_train_cv, y1_train)
#version 2
xgb_model_v2 = XGBClassifier(**xgb_params)
xgb_model_v2.fit(x2_train_cv, y2_train)
#version 3
xgb_model_v3 = XGBClassifier(**xgb_params)
xgb_model_v3.fit(x3_train_cv, y3_train)
XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=0.1, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=6, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
n_estimators=200, n_jobs=None, num_parallel_tree=None,
predictor=None, random_state=None, ...)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=0.1, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=6, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
n_estimators=200, n_jobs=None, num_parallel_tree=None,
predictor=None, random_state=None, ...)#Predict XGBoost model with no pre-processing version
#Predict XGBoost model with no pre-processing version
# accuracy_score expects (y_true, y_pred); accuracy is symmetric so the printed
# value is unchanged, but the conventional order avoids confusion downstream.
y_pred_xgb = xgb_model.predict(x_test_cv)
print('Accuracy of XGBoost with no pre-processing version by using CountVectorizer: {:.2f}%'.format(accuracy_score(y_test,y_pred_xgb)*100))
#Predict XGBoost model with version 1
y_pred_xgb_v1 = xgb_model_v1.predict(x1_test_cv)
print('Accuracy of XGBoost with version 1 by using CountVectorizer: {:.2f}%'.format(accuracy_score(y1_test,y_pred_xgb_v1)*100))
#Predict XGBoost model with version 2
y_pred_xgb_v2 = xgb_model_v2.predict(x2_test_cv)
print('Accuracy of XGBoost with version 2 by using CountVectorizer: {:.2f}%'.format(accuracy_score(y2_test,y_pred_xgb_v2)*100))
#Predict XGBoost model with version 3
y_pred_xgb_v3 = xgb_model_v3.predict(x3_test_cv)
print('Accuracy of XGBoost with version 3 by using CountVectorizer: {:.2f}%'.format(accuracy_score(y3_test,y_pred_xgb_v3)*100))
Accuracy of XGBoost with no pre-processing version by using CountVectorizer: 89.82% Accuracy of XGBoost with version 1 by using CountVectorizer: 89.87% Accuracy of XGBoost with version 2 by using CountVectorizer: 91.04% Accuracy of XGBoost with version 3 by using CountVectorizer: 90.92%
#Confusion matrices for the XGBoost variants
def _plot_cm(y_true, y_pred, title):
    # Render one labelled confusion matrix for a single model variant.
    # NOTE(review): display_labels=[True, False] maps class 0 -> True and
    # class 1 -> False; confirm this matches the intended 0/1 label meaning.
    cm = confusion_matrix(y_true, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[True,False])
    fig, ax = plt.subplots(figsize=(8, 8))
    disp.plot(ax=ax)
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')
    ax.set_title(title)
    plt.show()

_plot_cm(y_test, y_pred_xgb, 'Confusion Matrix - XGBoost(no preprocessing)')
_plot_cm(y1_test, y_pred_xgb_v1, 'Confusion Matrix - XGBoost(version 1)')
_plot_cm(y2_test, y_pred_xgb_v2, 'Confusion Matrix - XGBoost(version 2)')
_plot_cm(y3_test, y_pred_xgb_v3, 'Confusion Matrix - XGBoost(version 3)')
#no pre-processing version
# classification_report expects (y_true, y_pred); the original order was
# swapped, which transposes precision/recall and mislabels the support column.
print(classification_report(y_test,y_pred_xgb))
precision recall f1-score support
0 0.88 0.91 0.89 6756
1 0.92 0.89 0.90 7552
accuracy 0.90 14308
macro avg 0.90 0.90 0.90 14308
weighted avg 0.90 0.90 0.90 14308
#version 1
# classification_report expects (y_true, y_pred); the original swapped them,
# transposing the precision/recall columns.
print(classification_report(y1_test,y_pred_xgb_v1))
precision recall f1-score support
0 0.88 0.91 0.89 6746
1 0.92 0.89 0.90 7562
accuracy 0.90 14308
macro avg 0.90 0.90 0.90 14308
weighted avg 0.90 0.90 0.90 14308
#version 2
# classification_report expects (y_true, y_pred); the original swapped them,
# transposing the precision/recall columns.
print(classification_report(y2_test,y_pred_xgb_v2))
precision recall f1-score support
0 0.86 0.95 0.90 6319
1 0.96 0.88 0.92 7989
accuracy 0.91 14308
macro avg 0.91 0.91 0.91 14308
weighted avg 0.91 0.91 0.91 14308
#version 3
# classification_report expects (y_true, y_pred); the original swapped them,
# transposing the precision/recall columns.
print(classification_report(y3_test,y_pred_xgb_v3))
precision recall f1-score support
0 0.86 0.95 0.90 6296
1 0.96 0.88 0.92 8012
accuracy 0.91 14308
macro avg 0.91 0.91 0.91 14308
weighted avg 0.91 0.91 0.91 14308
#no pre-processing version
# Calculate the AUC-ROC score
# NOTE(review): AUC is computed from hard 0/1 predictions, not probabilities;
# xgb_model.predict_proba would give a smoother ROC — confirm intent.
auc_roc = roc_auc_score(y_test, y_pred_xgb)
print('AUC-ROC score:', auc_roc)
# Plot the ROC curve
roc_display = RocCurveDisplay.from_predictions(y_test, y_pred_xgb)
plt.title('ROC Curve (no preprocessing)')  # fixed typo: "prerprocessing"
plt.show()
AUC-ROC score: 0.8977362529006052
#version 1
# AUC-ROC for the version-1 XGBoost model (from hard label predictions)
v1_auc = roc_auc_score(y1_test, y_pred_xgb_v1)
print('AUC-ROC score:', v1_auc)
# Draw the corresponding ROC curve
roc_display = RocCurveDisplay.from_predictions(y1_test, y_pred_xgb_v1)
plt.title('ROC Curve (version 1)')
plt.show()
AUC-ROC score: 0.898279361556333
#version 2
# AUC-ROC for the version-2 XGBoost model (from hard label predictions)
v2_auc = roc_auc_score(y2_test, y_pred_xgb_v2)
print('AUC-ROC score:', v2_auc)
# Draw the corresponding ROC curve
roc_display = RocCurveDisplay.from_predictions(y2_test, y_pred_xgb_v2)
plt.title('ROC Curve (version 2)')
plt.show()
AUC-ROC score: 0.9092604844788827
#version 3
# AUC-ROC for the version-3 XGBoost model (from hard label predictions)
v3_auc = roc_auc_score(y3_test, y_pred_xgb_v3)
print('AUC-ROC score:', v3_auc)
# Draw the corresponding ROC curve
roc_display = RocCurveDisplay.from_predictions(y3_test, y_pred_xgb_v3)
plt.title('ROC Curve (version 3)')
plt.show()
AUC-ROC score: 0.9080341450951498
# convert text into vectors by TF-IDF
# One vectorizer per preprocessing variant: fit on the training split only,
# then apply the same vocabulary/IDF weights to the test split.
# no pre-processing
tf = TfidfVectorizer()
x_train_vect, x_test_vect = tf.fit_transform(x_train), tf.transform(x_test)
# version 1
tf_v1 = TfidfVectorizer()
x1_train_vect, x1_test_vect = tf_v1.fit_transform(x1_train), tf_v1.transform(x1_test)
# version 2
tf_v2 = TfidfVectorizer()
x2_train_vect, x2_test_vect = tf_v2.fit_transform(x2_train), tf_v2.transform(x2_test)
# version 3
tf_v3 = TfidfVectorizer()
x3_train_vect, x3_test_vect = tf_v3.fit_transform(x3_train), tf_v3.transform(x3_test)
#Logistic Regression
# Same solver and iteration budget for all four preprocessing variants.
lr_kwargs = dict(solver='lbfgs', max_iter=1000)
#no pre-processing
lr_model = LogisticRegression(**lr_kwargs)
lr_model.fit(x_train_vect, y_train)
#version 1
lr_model_v1 = LogisticRegression(**lr_kwargs)
lr_model_v1.fit(x1_train_vect, y1_train)
#version 2
lr_model_v2 = LogisticRegression(**lr_kwargs)
lr_model_v2.fit(x2_train_vect, y2_train)
#version 3
lr_model_v3 = LogisticRegression(**lr_kwargs)
lr_model_v3.fit(x3_train_vect, y3_train)
LogisticRegression(max_iter=1000)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LogisticRegression(max_iter=1000)
#Predict lr model with no pre-processing version
# accuracy_score expects (y_true, y_pred); accuracy is symmetric so the printed
# value is unchanged, but the conventional order avoids confusion downstream.
y_pred_lr = lr_model.predict(x_test_vect)
print('Accuracy of Logistic Regression model with no pre-processing version by using TF-IDF: {:.2f}%'.format(accuracy_score(y_test,y_pred_lr)*100))
#Predict lr model with version 1
y_pred_lr_v1 = lr_model_v1.predict(x1_test_vect)
print('Accuracy of Logistic Regression model with version 1 by using TF-IDF: {:.2f}%'.format(accuracy_score(y1_test,y_pred_lr_v1)*100))
#Predict lr model with version 2
y_pred_lr_v2 = lr_model_v2.predict(x2_test_vect)
print('Accuracy of Logistic Regression model with version 2 by using TF-IDF: {:.2f}%'.format(accuracy_score(y2_test,y_pred_lr_v2)*100))
#Predict lr model with version 3
y_pred_lr_v3 = lr_model_v3.predict(x3_test_vect)
print('Accuracy of Logistic Regression model with version 3 by using TF-IDF: {:.2f}%'.format(accuracy_score(y3_test,y_pred_lr_v3)*100))
Accuracy of Logistic Regression model with no pre-processing version by using TF-IDF: 94.98% Accuracy of Logistic Regression model with version 1 by using TF-IDF: 94.82% Accuracy of Logistic Regression model with version 2 by using TF-IDF: 95.18% Accuracy of Logistic Regression model with version 3 by using TF-IDF: 95.22%
#Confusion matrices for the Logistic Regression variants
def _plot_cm(y_true, y_pred, title):
    # Render one labelled confusion matrix for a single model variant.
    # NOTE(review): display_labels=[True, False] maps class 0 -> True and
    # class 1 -> False; confirm this matches the intended 0/1 label meaning.
    cm = confusion_matrix(y_true, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[True,False])
    fig, ax = plt.subplots(figsize=(8, 8))
    disp.plot(ax=ax)
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')
    ax.set_title(title)
    plt.show()

_plot_cm(y_test, y_pred_lr, 'Confusion Matrix - Logistic Regression(no preprocessing)')
_plot_cm(y1_test, y_pred_lr_v1, 'Confusion Matrix - Logistic Regression(version 1)')
_plot_cm(y2_test, y_pred_lr_v2, 'Confusion Matrix - Logistic Regression(version 2)')
_plot_cm(y3_test, y_pred_lr_v3, 'Confusion Matrix - Logistic Regression(version 3)')
#no pre-processing version
# classification_report expects (y_true, y_pred); the original order was
# swapped, which transposes precision/recall and mislabels the support column.
print(classification_report(y_test,y_pred_lr))
precision recall f1-score support
0 0.94 0.96 0.95 6859
1 0.96 0.94 0.95 7449
accuracy 0.95 14308
macro avg 0.95 0.95 0.95 14308
weighted avg 0.95 0.95 0.95 14308
#version 1
# classification_report expects (y_true, y_pred); the original swapped them,
# transposing the precision/recall columns.
print(classification_report(y1_test,y_pred_lr_v1))
precision recall f1-score support
0 0.94 0.96 0.95 6848
1 0.96 0.94 0.95 7460
accuracy 0.95 14308
macro avg 0.95 0.95 0.95 14308
weighted avg 0.95 0.95 0.95 14308
#version 2
# classification_report expects (y_true, y_pred); the original swapped them,
# transposing the precision/recall columns.
print(classification_report(y2_test,y_pred_lr_v2))
precision recall f1-score support
0 0.94 0.96 0.95 6891
1 0.96 0.95 0.95 7417
accuracy 0.95 14308
macro avg 0.95 0.95 0.95 14308
weighted avg 0.95 0.95 0.95 14308
#version 3
# classification_report expects (y_true, y_pred); the original swapped them,
# transposing the precision/recall columns.
print(classification_report(y3_test,y_pred_lr_v3))
precision recall f1-score support
0 0.94 0.96 0.95 6885
1 0.96 0.95 0.95 7423
accuracy 0.95 14308
macro avg 0.95 0.95 0.95 14308
weighted avg 0.95 0.95 0.95 14308
#no pre-processing version
# Calculate the AUC-ROC score
# NOTE(review): AUC is computed from hard 0/1 predictions, not probabilities;
# lr_model.predict_proba would give a smoother ROC — confirm intent.
auc_roc = roc_auc_score(y_test, y_pred_lr)
print('AUC-ROC score:', auc_roc)
# Plot the ROC curve
roc_display = RocCurveDisplay.from_predictions(y_test, y_pred_lr)
plt.title('ROC Curve (no preprocessing)')  # fixed typo: "prerprocessing"
plt.show()
AUC-ROC score: 0.9495819768079838
#version 1
# AUC-ROC for the version-1 logistic regression (from hard label predictions)
v1_auc = roc_auc_score(y1_test, y_pred_lr_v1)
print('AUC-ROC score:', v1_auc)
# Draw the corresponding ROC curve
roc_display = RocCurveDisplay.from_predictions(y1_test, y_pred_lr_v1)
plt.title('ROC Curve (version 1)')
plt.show()
AUC-ROC score: 0.9479556517111304
#version 2
# AUC-ROC for the version-2 logistic regression (from hard label predictions)
v2_auc = roc_auc_score(y2_test, y_pred_lr_v2)
print('AUC-ROC score:', v2_auc)
# Draw the corresponding ROC curve
roc_display = RocCurveDisplay.from_predictions(y2_test, y_pred_lr_v2)
plt.title('ROC Curve (version 2)')
plt.show()
AUC-ROC score: 0.9515922275855367
#version 3
# AUC-ROC for the version-3 logistic regression (from hard label predictions)
v3_auc = roc_auc_score(y3_test, y_pred_lr_v3)
print('AUC-ROC score:', v3_auc)
# Draw the corresponding ROC curve
roc_display = RocCurveDisplay.from_predictions(y3_test, y_pred_lr_v3)
plt.title('ROC Curve (version 3)')
plt.show()
AUC-ROC score: 0.9520020076702246
#Random Forest Classifier
# Identical forest configuration for every preprocessing variant.
rf_params = dict(n_estimators=100, criterion='entropy')
#no pre-processing
rf_model = RandomForestClassifier(**rf_params)
rf_model.fit(x_train_vect, y_train)
#version 1
rf_model_v1 = RandomForestClassifier(**rf_params)
rf_model_v1.fit(x1_train_vect, y1_train)
#version 2
rf_model_v2 = RandomForestClassifier(**rf_params)
rf_model_v2.fit(x2_train_vect, y2_train)
#version 3
rf_model_v3 = RandomForestClassifier(**rf_params)
rf_model_v3.fit(x3_train_vect, y3_train)
RandomForestClassifier(criterion='entropy')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomForestClassifier(criterion='entropy')
#Predict rf model with no pre-processing version
# accuracy_score expects (y_true, y_pred); accuracy is symmetric so the printed
# value is unchanged, but the conventional order avoids confusion downstream.
y_pred_rf = rf_model.predict(x_test_vect)
print('Accuracy of Random Forest Classifier with no pre-processing version by using TF-IDF: {:.2f}%'.format(accuracy_score(y_test,y_pred_rf)*100))
#Predict rf model with version 1
y_pred_rf_v1 = rf_model_v1.predict(x1_test_vect)
print('Accuracy of Random Forest Classifier with version 1 by using TF-IDF: {:.2f}%'.format(accuracy_score(y1_test,y_pred_rf_v1)*100))
#Predict rf model with version 2
y_pred_rf_v2 = rf_model_v2.predict(x2_test_vect)
print('Accuracy of Random Forest Classifier with version 2 by using TF-IDF: {:.2f}%'.format(accuracy_score(y2_test,y_pred_rf_v2)*100))
#Predict rf model with version 3
y_pred_rf_v3 = rf_model_v3.predict(x3_test_vect)
print('Accuracy of Random Forest Classifier with version 3 by using TF-IDF: {:.2f}%'.format(accuracy_score(y3_test,y_pred_rf_v3)*100))
Accuracy of Random Forest Classifier with no pre-processing version by using TF-IDF: 93.28% Accuracy of Random Forest Classifier with version 1 by using TF-IDF: 93.02% Accuracy of Random Forest Classifier with version 2 by using TF-IDF: 93.27% Accuracy of Random Forest Classifier with version 3 by using TF-IDF: 93.12%
#Confusion matrices for the Random Forest variants
def _plot_cm(y_true, y_pred, title):
    # Render one labelled confusion matrix for a single model variant.
    # NOTE(review): display_labels=[True, False] maps class 0 -> True and
    # class 1 -> False; confirm this matches the intended 0/1 label meaning.
    cm = confusion_matrix(y_true, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[True,False])
    fig, ax = plt.subplots(figsize=(8, 8))
    disp.plot(ax=ax)
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')
    ax.set_title(title)
    plt.show()

_plot_cm(y_test, y_pred_rf, 'Confusion Matrix - Random Forest(no preprocessing)')
_plot_cm(y1_test, y_pred_rf_v1, 'Confusion Matrix - Random Forest(version 1)')
_plot_cm(y2_test, y_pred_rf_v2, 'Confusion Matrix - Random Forest(version 2)')
_plot_cm(y3_test, y_pred_rf_v3, 'Confusion Matrix - Random Forest(version 3)')
#no pre-processing version
# classification_report expects (y_true, y_pred); the original order was
# swapped, which transposes precision/recall and mislabels the support column.
print(classification_report(y_test,y_pred_rf))
precision recall f1-score support
0 0.92 0.95 0.93 6763
1 0.95 0.92 0.94 7545
accuracy 0.93 14308
macro avg 0.93 0.93 0.93 14308
weighted avg 0.93 0.93 0.93 14308
#version 1
# classification_report expects (y_true, y_pred); the original swapped them,
# transposing the precision/recall columns.
print(classification_report(y1_test,y_pred_rf_v1))
precision recall f1-score support
0 0.91 0.94 0.93 6754
1 0.95 0.92 0.93 7554
accuracy 0.93 14308
macro avg 0.93 0.93 0.93 14308
weighted avg 0.93 0.93 0.93 14308
#version 2
# classification_report expects (y_true, y_pred); the original swapped them,
# transposing the precision/recall columns.
print(classification_report(y2_test,y_pred_rf_v2))
precision recall f1-score support
0 0.91 0.95 0.93 6750
1 0.95 0.92 0.94 7558
accuracy 0.93 14308
macro avg 0.93 0.93 0.93 14308
weighted avg 0.93 0.93 0.93 14308
#version 3
# classification_report expects (y_true, y_pred); the original swapped them,
# transposing the precision/recall columns.
print(classification_report(y3_test,y_pred_rf_v3))
precision recall f1-score support
0 0.92 0.94 0.93 6787
1 0.95 0.92 0.93 7521
accuracy 0.93 14308
macro avg 0.93 0.93 0.93 14308
weighted avg 0.93 0.93 0.93 14308
#no pre-processing version
# Calculate the AUC-ROC score
# NOTE(review): AUC is computed from hard 0/1 predictions, not probabilities;
# rf_model.predict_proba would give a smoother ROC — confirm intent.
auc_roc = roc_auc_score(y_test, y_pred_rf)
print('AUC-ROC score:', auc_roc)
# Plot the ROC curve
roc_display = RocCurveDisplay.from_predictions(y_test, y_pred_rf)
plt.title('ROC Curve (no preprocessing)')  # fixed typo: "prerprocessing"
plt.show()
AUC-ROC score: 0.932362572308512
#version 1
# AUC-ROC for the version-1 random forest (from hard label predictions)
v1_auc = roc_auc_score(y1_test, y_pred_rf_v1)
print('AUC-ROC score:', v1_auc)
# Draw the corresponding ROC curve
roc_display = RocCurveDisplay.from_predictions(y1_test, y_pred_rf_v1)
plt.title('ROC Curve (version 1)')
plt.show()
AUC-ROC score: 0.9297605049375851
#version 2
# AUC-ROC for the version-2 random forest (from hard label predictions)
v2_auc = roc_auc_score(y2_test, y_pred_rf_v2)
print('AUC-ROC score:', v2_auc)
# Draw the corresponding ROC curve
roc_display = RocCurveDisplay.from_predictions(y2_test, y_pred_rf_v2)
plt.title('ROC Curve (version 2)')
plt.show()
AUC-ROC score: 0.932271422094073
#version 3
# AUC-ROC for the version-3 random forest (from hard label predictions)
v3_auc = roc_auc_score(y3_test, y_pred_rf_v3)
print('AUC-ROC score:', v3_auc)
# Draw the corresponding ROC curve
roc_display = RocCurveDisplay.from_predictions(y3_test, y_pred_rf_v3)
plt.title('ROC Curve (version 3)')
plt.show()
AUC-ROC score: 0.9308633101218456
#Multinomial Naive Bayes
# One default-parameter MNB per preprocessing variant.
mnb_model, mnb_model_v1, mnb_model_v2, mnb_model_v3 = (
    MultinomialNB() for _ in range(4))
#no pre-processing
mnb_model.fit(x_train_vect, y_train)
#version 1
mnb_model_v1.fit(x1_train_vect, y1_train)
#version 2
mnb_model_v2.fit(x2_train_vect, y2_train)
#version 3
mnb_model_v3.fit(x3_train_vect, y3_train)
MultinomialNB()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
MultinomialNB()
#Predict mnb model with no pre-processing version
# accuracy_score expects (y_true, y_pred); accuracy is symmetric so the printed
# value is unchanged, but the conventional order avoids confusion downstream.
y_pred_mnb = mnb_model.predict(x_test_vect)
print('Accuracy of Multinomial Naive Bayes with no pre-processing version by using TF-IDF: {:.2f}%'.format(accuracy_score(y_test,y_pred_mnb)*100))
#Predict mnb model with version 1
y_pred_mnb_v1 = mnb_model_v1.predict(x1_test_vect)
print('Accuracy of Multinomial Naive Bayes with version 1 by using TF-IDF: {:.2f}%'.format(accuracy_score(y1_test,y_pred_mnb_v1)*100))
#Predict mnb model with version 2
y_pred_mnb_v2 = mnb_model_v2.predict(x2_test_vect)
print('Accuracy of Multinomial Naive Bayes with version 2 by using TF-IDF: {:.2f}%'.format(accuracy_score(y2_test,y_pred_mnb_v2)*100))
#Predict mnb model with version 3
y_pred_mnb_v3 = mnb_model_v3.predict(x3_test_vect)
print('Accuracy of Multinomial Naive Bayes with version 3 by using TF-IDF: {:.2f}%'.format(accuracy_score(y3_test,y_pred_mnb_v3)*100))
Accuracy of Multinomial Naive Bayes with no pre-processing version by using TF-IDF: 86.30% Accuracy of Multinomial Naive Bayes with version 1 by using TF-IDF: 86.01% Accuracy of Multinomial Naive Bayes with version 2 by using TF-IDF: 86.54% Accuracy of Multinomial Naive Bayes with version 3 by using TF-IDF: 86.30%
#Confusion matrices for the Multinomial Naive Bayes variants
def _plot_cm(y_true, y_pred, title):
    # Render one labelled confusion matrix for a single model variant.
    # NOTE(review): display_labels=[True, False] maps class 0 -> True and
    # class 1 -> False; confirm this matches the intended 0/1 label meaning.
    cm = confusion_matrix(y_true, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[True,False])
    fig, ax = plt.subplots(figsize=(8, 8))
    disp.plot(ax=ax)
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')
    ax.set_title(title)
    plt.show()

_plot_cm(y_test, y_pred_mnb, 'Confusion Matrix - Multinomial Naive Bayes(no preprocessing)')
_plot_cm(y1_test, y_pred_mnb_v1, 'Confusion Matrix - Multinomial Naive Bayes(version 1)')
_plot_cm(y2_test, y_pred_mnb_v2, 'Confusion Matrix - Multinomial Naive Bayes(version 2)')
_plot_cm(y3_test, y_pred_mnb_v3, 'Confusion Matrix - Multinomial Naive Bayes(version 3)')
#no pre-processing version
# classification_report expects (y_true, y_pred); the original order was
# swapped, which transposes precision/recall and mislabels the support column.
print(classification_report(y_test,y_pred_mnb))
precision recall f1-score support
0 0.85 0.86 0.86 6897
1 0.87 0.86 0.87 7411
accuracy 0.86 14308
macro avg 0.86 0.86 0.86 14308
weighted avg 0.86 0.86 0.86 14308
#version 1
# classification_report expects (y_true, y_pred); the original swapped them,
# transposing the precision/recall columns.
print(classification_report(y1_test,y_pred_mnb_v1))
precision recall f1-score support
0 0.84 0.87 0.85 6782
1 0.88 0.85 0.87 7526
accuracy 0.86 14308
macro avg 0.86 0.86 0.86 14308
weighted avg 0.86 0.86 0.86 14308
#version 2
# classification_report expects (y_true, y_pred); the original swapped them,
# transposing the precision/recall columns.
print(classification_report(y2_test,y_pred_mnb_v2))
precision recall f1-score support
0 0.85 0.87 0.86 6757
1 0.88 0.86 0.87 7551
accuracy 0.87 14308
macro avg 0.86 0.87 0.87 14308
weighted avg 0.87 0.87 0.87 14308
#version 3
# classification_report expects (y_true, y_pred); the original swapped them,
# transposing the precision/recall columns.
print(classification_report(y3_test,y_pred_mnb_v3))
precision recall f1-score support
0 0.84 0.87 0.86 6735
1 0.88 0.85 0.87 7573
accuracy 0.86 14308
macro avg 0.86 0.86 0.86 14308
weighted avg 0.86 0.86 0.86 14308
#no pre-processing version
# Calculate the AUC-ROC score
# NOTE(review): AUC is computed from hard 0/1 predictions, not probabilities;
# mnb_model.predict_proba would give a smoother ROC — confirm intent.
auc_roc = roc_auc_score(y_test, y_pred_mnb)
print('AUC-ROC score:', auc_roc)
# Plot the ROC curve
roc_display = RocCurveDisplay.from_predictions(y_test, y_pred_mnb)
plt.title('ROC Curve (no preprocessing)')  # fixed typo: "prerprocessing"
plt.show()
AUC-ROC score: 0.862792095383025
#version 1
# AUC-ROC for the version-1 MNB model (from hard label predictions)
v1_auc = roc_auc_score(y1_test, y_pred_mnb_v1)
print('AUC-ROC score:', v1_auc)
# Draw the corresponding ROC curve
roc_display = RocCurveDisplay.from_predictions(y1_test, y_pred_mnb_v1)
plt.title('ROC Curve (version 1)')
plt.show()
AUC-ROC score: 0.8597372778102335
#version 2
# AUC-ROC for the version-2 MNB model (from hard label predictions)
v2_auc = roc_auc_score(y2_test, y_pred_mnb_v2)
print('AUC-ROC score:', v2_auc)
# Draw the corresponding ROC curve
roc_display = RocCurveDisplay.from_predictions(y2_test, y_pred_mnb_v2)
plt.title('ROC Curve (version 2)')
plt.show()
AUC-ROC score: 0.8649411486318953
#version 3
# AUC-ROC for the version-3 MNB model (from hard label predictions)
v3_auc = roc_auc_score(y3_test, y_pred_mnb_v3)
print('AUC-ROC score:', v3_auc)
# Draw the corresponding ROC curve
roc_display = RocCurveDisplay.from_predictions(y3_test, y_pred_mnb_v3)
plt.title('ROC Curve (version 3)')
plt.show()
AUC-ROC score: 0.8625276473506998
#Passive Aggressive Classifier
# Same iteration cap for every preprocessing variant.
pac_kwargs = dict(max_iter=200)
#no pre-processing
pac_model = PassiveAggressiveClassifier(**pac_kwargs)
pac_model.fit(x_train_vect, y_train)
#version 1
pac_model_v1 = PassiveAggressiveClassifier(**pac_kwargs)
pac_model_v1.fit(x1_train_vect, y1_train)
#version 2
pac_model_v2 = PassiveAggressiveClassifier(**pac_kwargs)
pac_model_v2.fit(x2_train_vect, y2_train)
#version 3
pac_model_v3 = PassiveAggressiveClassifier(**pac_kwargs)
pac_model_v3.fit(x3_train_vect, y3_train)
PassiveAggressiveClassifier(max_iter=200)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PassiveAggressiveClassifier(max_iter=200)
#Predict pac model with no pre-processing version
# accuracy_score expects (y_true, y_pred); accuracy is symmetric so the printed
# value is unchanged, but the conventional order avoids confusion downstream.
y_pred_pac = pac_model.predict(x_test_vect)
print('Accuracy of Passive Aggressive Classifier with no pre-processing version by using TF-IDF: {:.2f}%'.format(accuracy_score(y_test,y_pred_pac)*100))
#Predict pac model with version 1
y_pred_pac_v1 = pac_model_v1.predict(x1_test_vect)
print('Accuracy of Passive Aggressive Classifier with version 1 by using TF-IDF: {:.2f}%'.format(accuracy_score(y1_test,y_pred_pac_v1)*100))
#Predict pac model with version 2
y_pred_pac_v2 = pac_model_v2.predict(x2_test_vect)
print('Accuracy of Passive Aggressive Classifier with version 2 by using TF-IDF: {:.2f}%'.format(accuracy_score(y2_test,y_pred_pac_v2)*100))
#Predict pac model with version 3
y_pred_pac_v3 = pac_model_v3.predict(x3_test_vect)
print('Accuracy of Passive Aggressive Classifier with version 3 by using TF-IDF: {:.2f}%'.format(accuracy_score(y3_test,y_pred_pac_v3)*100))
Accuracy of Passive Aggressive Classifier with no pre-processing version by using TF-IDF: 96.79% Accuracy of Passive Aggressive Classifier with version 1 by using TF-IDF: 96.62% Accuracy of Passive Aggressive Classifier with version 2 by using TF-IDF: 96.34% Accuracy of Passive Aggressive Classifier with version 3 by using TF-IDF: 96.36%
#Confusion matrices for the Passive Aggressive Classifier variants
def _plot_cm(y_true, y_pred, title):
    # Render one labelled confusion matrix for a single model variant.
    # NOTE(review): display_labels=[True, False] maps class 0 -> True and
    # class 1 -> False; confirm this matches the intended 0/1 label meaning.
    cm = confusion_matrix(y_true, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[True,False])
    fig, ax = plt.subplots(figsize=(8, 8))
    disp.plot(ax=ax)
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')
    ax.set_title(title)
    plt.show()

_plot_cm(y_test, y_pred_pac, 'Confusion Matrix - Passive Aggressive Classifier(no preprocessing)')
_plot_cm(y1_test, y_pred_pac_v1, 'Confusion Matrix - Passive Aggressive Classifier(version 1)')
_plot_cm(y2_test, y_pred_pac_v2, 'Confusion Matrix - Passive Aggressive Classifier(version 2)')
_plot_cm(y3_test, y_pred_pac_v3, 'Confusion Matrix - Passive Aggressive Classifier(version 3)')
#no pre-processing version
# classification_report expects (y_true, y_pred); the original order was
# swapped, which transposes precision/recall and mislabels the support column.
print(classification_report(y_test,y_pred_pac))
precision recall f1-score support
0 0.96 0.97 0.97 6917
1 0.97 0.96 0.97 7391
accuracy 0.97 14308
macro avg 0.97 0.97 0.97 14308
weighted avg 0.97 0.97 0.97 14308
#version 1
# classification_report expects (y_true, y_pred); the original swapped them,
# transposing the precision/recall columns.
print(classification_report(y1_test,y_pred_pac_v1))
precision recall f1-score support
0 0.96 0.97 0.97 6915
1 0.97 0.96 0.97 7393
accuracy 0.97 14308
macro avg 0.97 0.97 0.97 14308
weighted avg 0.97 0.97 0.97 14308
#version 2
# classification_report expects (y_true, y_pred); the original swapped them,
# transposing the precision/recall columns.
print(classification_report(y2_test,y_pred_pac_v2))
precision recall f1-score support
0 0.96 0.97 0.96 6890
1 0.97 0.96 0.96 7418
accuracy 0.96 14308
macro avg 0.96 0.96 0.96 14308
weighted avg 0.96 0.96 0.96 14308
#version 3
# classification_report expects (y_true, y_pred); the original swapped them,
# transposing the precision/recall columns.
print(classification_report(y3_test,y_pred_pac_v3))
precision recall f1-score support
0 0.96 0.97 0.96 6892
1 0.97 0.96 0.96 7416
accuracy 0.96 14308
macro avg 0.96 0.96 0.96 14308
weighted avg 0.96 0.96 0.96 14308
#no pre-processing version
# Calculate the AUC-ROC score
# NOTE(review): AUC is computed from hard 0/1 predictions, not decision
# scores; pac_model.decision_function would give a smoother ROC — confirm intent.
auc_roc = roc_auc_score(y_test, y_pred_pac)
print('AUC-ROC score:', auc_roc)
# Plot the ROC curve
roc_display = RocCurveDisplay.from_predictions(y_test, y_pred_pac)
plt.title('ROC Curve (no preprocessing)')  # fixed typo: "prerprocessing"
plt.show()
AUC-ROC score: 0.9677183573521234
#version 1
# AUC-ROC for the version-1 PAC model (from hard label predictions)
v1_auc = roc_auc_score(y1_test, y_pred_pac_v1)
print('AUC-ROC score:', v1_auc)
# Draw the corresponding ROC curve
roc_display = RocCurveDisplay.from_predictions(y1_test, y_pred_pac_v1)
plt.title('ROC Curve (version 1)')
plt.show()
AUC-ROC score: 0.966036794736579
#version 2
# AUC-ROC for the version-2 PAC model (from hard label predictions)
v2_auc = roc_auc_score(y2_test, y_pred_pac_v2)
print('AUC-ROC score:', v2_auc)
# Draw the corresponding ROC curve
roc_display = RocCurveDisplay.from_predictions(y2_test, y_pred_pac_v2)
plt.title('ROC Curve (version 2)')
plt.show()
AUC-ROC score: 0.9632687508893867
#version 3
# AUC-ROC for the version-3 PAC model (from hard label predictions)
v3_auc = roc_auc_score(y3_test, y_pred_pac_v3)
print('AUC-ROC score:', v3_auc)
# Draw the corresponding ROC curve
roc_display = RocCurveDisplay.from_predictions(y3_test, y_pred_pac_v3)
plt.title('ROC Curve (version 3)')
plt.show()
AUC-ROC score: 0.9634118738319944
#Decision Tree Classifier
# One default-parameter tree per preprocessing variant.
dt_model, dt_model_v1, dt_model_v2, dt_model_v3 = (
    DecisionTreeClassifier() for _ in range(4))
#no pre-processing
dt_model.fit(x_train_vect, y_train)
#version 1
dt_model_v1.fit(x1_train_vect, y1_train)
#version 2
dt_model_v2.fit(x2_train_vect, y2_train)
#version 3
dt_model_v3.fit(x3_train_vect, y3_train)
DecisionTreeClassifier()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
DecisionTreeClassifier()
#Predict dt model with no pre-processing version
# accuracy_score expects (y_true, y_pred); accuracy is symmetric so the printed
# value is unchanged, but the conventional order avoids confusion downstream.
y_pred_dt = dt_model.predict(x_test_vect)
print('Accuracy of Decision Tree Classifier with no pre-processing version by using TF-IDF: {:.2f}%'.format(accuracy_score(y_test,y_pred_dt)*100))
#Predict dt model with version 1
y_pred_dt_v1 = dt_model_v1.predict(x1_test_vect)
print('Accuracy of Decision Tree Classifier with version 1 by using TF-IDF: {:.2f}%'.format(accuracy_score(y1_test,y_pred_dt_v1)*100))
#Predict dt model with version 2
y_pred_dt_v2 = dt_model_v2.predict(x2_test_vect)
print('Accuracy of Decision Tree Classifier with version 2 by using TF-IDF: {:.2f}%'.format(accuracy_score(y2_test,y_pred_dt_v2)*100))
#Predict dt model with version 3
y_pred_dt_v3 = dt_model_v3.predict(x3_test_vect)
# fixed typo in message: "Decision TreeClassifier" -> "Decision Tree Classifier"
print('Accuracy of Decision Tree Classifier with version 3 by using TF-IDF: {:.2f}%'.format(accuracy_score(y3_test,y_pred_dt_v3)*100))
Accuracy of Decision Tree Classifier with no pre-processing version by using TF-IDF: 93.01% Accuracy of Decision Tree Classifier with version 1 by using TF-IDF: 93.16% Accuracy of Decision Tree Classifier with version 2 by using TF-IDF: 93.21% Accuracy of Decision TreeClassifier with version 3 by using TF-IDF: 93.24%
#Confusion matrices for the Decision Tree models.
def _plot_cm_dt(y_true, y_pred, title):
    """Plot a labelled confusion matrix for one Decision Tree variant."""
    cm = confusion_matrix(y_true, y_pred)
    # Rows/columns follow sorted label order [0, 1]; label 0 = fake, 1 = real
    # (see the prediction loop at the end of the notebook). The original
    # display_labels=[True, False] mapped 0 -> True and 1 -> False, i.e. backwards.
    cm_display = ConfusionMatrixDisplay(confusion_matrix=cm,
                                        display_labels=['Fake (0)', 'Real (1)'])
    fig, ax = plt.subplots(figsize=(8, 8))
    cm_display.plot(ax=ax)
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')
    ax.set_title(title)
    plt.show()

_plot_cm_dt(y_test, y_pred_dt, 'Confusion Matrix - Decision Tree(no preprocessing)')
_plot_cm_dt(y1_test, y_pred_dt_v1, 'Confusion Matrix - Decision Tree(version 1)')
_plot_cm_dt(y2_test, y_pred_dt_v2, 'Confusion Matrix - Decision Tree(version 2)')
_plot_cm_dt(y3_test, y_pred_dt_v3, 'Confusion Matrix - Decision Tree(version 3)')
#no pre-processing version
# classification_report expects (y_true, y_pred); the original swapped order
# transposed the precision/recall columns and reported predicted-class supports.
print(classification_report(y_test, y_pred_dt))
precision recall f1-score support
0 0.91 0.94 0.93 6765
1 0.95 0.92 0.93 7543
accuracy 0.93 14308
macro avg 0.93 0.93 0.93 14308
weighted avg 0.93 0.93 0.93 14308
#version 1
# classification_report expects (y_true, y_pred); original order was swapped.
print(classification_report(y1_test, y_pred_dt_v1))
precision recall f1-score support
0 0.92 0.94 0.93 6833
1 0.94 0.92 0.93 7475
accuracy 0.93 14308
macro avg 0.93 0.93 0.93 14308
weighted avg 0.93 0.93 0.93 14308
#version 2
# classification_report expects (y_true, y_pred); original order was swapped.
print(classification_report(y2_test, y_pred_dt_v2))
precision recall f1-score support
0 0.92 0.94 0.93 6883
1 0.94 0.93 0.93 7425
accuracy 0.93 14308
macro avg 0.93 0.93 0.93 14308
weighted avg 0.93 0.93 0.93 14308
#version 3
# classification_report expects (y_true, y_pred); original order was swapped.
print(classification_report(y3_test, y_pred_dt_v3))
precision recall f1-score support
0 0.92 0.94 0.93 6810
1 0.95 0.92 0.93 7498
accuracy 0.93 14308
macro avg 0.93 0.93 0.93 14308
weighted avg 0.93 0.93 0.93 14308
#no pre-processing version
# Calculate the AUC-ROC score
# NOTE(review): computed from hard 0/1 predictions, so this equals balanced
# accuracy; use predict_proba scores for a true ROC AUC.
auc_roc = roc_auc_score(y_test, y_pred_dt)
print('AUC-ROC score:', auc_roc)
# Plot the ROC curve
roc_display = RocCurveDisplay.from_predictions(y_test, y_pred_dt)
plt.title('ROC Curve (no preprocessing)')  # fixed typo: "prerprocessing"
plt.show()
AUC-ROC score: 0.9297085322094165
#version 1
# AUC-ROC for the version 1 Decision Tree (from hard label predictions)
auc_roc = roc_auc_score(y1_test, y_pred_dt_v1)
print(f'AUC-ROC score: {auc_roc}')
# Draw the ROC curve for the same predictions
roc_display = RocCurveDisplay.from_predictions(y1_test, y_pred_dt_v1)
plt.title('ROC Curve (version 1)')
plt.show()
AUC-ROC score: 0.9313579747601195
#version 2
# AUC-ROC for the version 2 Decision Tree (from hard label predictions)
auc_roc = roc_auc_score(y2_test, y_pred_dt_v2)
print(f'AUC-ROC score: {auc_roc}')
# Draw the ROC curve for the same predictions
roc_display = RocCurveDisplay.from_predictions(y2_test, y_pred_dt_v2)
plt.title('ROC Curve (version 2)')
plt.show()
AUC-ROC score: 0.9318591689794384
#version 3
# AUC-ROC for the version 3 Decision Tree (from hard label predictions)
auc_roc = roc_auc_score(y3_test, y_pred_dt_v3)
print(f'AUC-ROC score: {auc_roc}')
# Draw the ROC curve for the same predictions
roc_display = RocCurveDisplay.from_predictions(y3_test, y_pred_dt_v3)
plt.title('ROC Curve (version 3)')
plt.show()
AUC-ROC score: 0.9320896495055788
#Linear SVM
# One LinearSVC per preprocessing variant; max_iter raised so the solver converges.
svm_model, svm_model_v1, svm_model_v2, svm_model_v3 = (
    LinearSVC(max_iter=10000) for _ in range(4)
)
svm_model.fit(x_train_vect, y_train)        # no pre-processing
svm_model_v1.fit(x1_train_vect, y1_train)   # version 1
svm_model_v2.fit(x2_train_vect, y2_train)   # version 2
svm_model_v3.fit(x3_train_vect, y3_train)   # version 3
LinearSVC(max_iter=10000)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LinearSVC(max_iter=10000)
#Predict linear SVM model with no pre-processing version
y_pred_svm = svm_model.predict(x_test_vect)
acc = accuracy_score(y_pred_svm, y_test) * 100
print('Accuracy of Linear SVM with no pre-processing version by using TF-IDF: {:.2f}%'.format(acc))
#Predict linear SVM model with version 1
y_pred_svm_v1 = svm_model_v1.predict(x1_test_vect)
acc = accuracy_score(y_pred_svm_v1, y1_test) * 100
print('Accuracy of Linear SVM with version 1 by using TF-IDF: {:.2f}%'.format(acc))
#Predict linear SVM model with version 2
y_pred_svm_v2 = svm_model_v2.predict(x2_test_vect)
acc = accuracy_score(y_pred_svm_v2, y2_test) * 100
print('Accuracy of Linear SVM with version 2 by using TF-IDF: {:.2f}%'.format(acc))
#Predict linear SVM model with version 3
y_pred_svm_v3 = svm_model_v3.predict(x3_test_vect)
acc = accuracy_score(y_pred_svm_v3, y3_test) * 100
print('Accuracy of Linear SVM with version 3 by using TF-IDF: {:.2f}%'.format(acc))
Accuracy of Linear SVM with no pre-processing version by using TF-IDF: 97.02% Accuracy of Linear SVM with version 1 by using TF-IDF: 96.82% Accuracy of Linear SVM with version 2 by using TF-IDF: 96.75% Accuracy of Linear SVM with version 3 by using TF-IDF: 96.75%
#Confusion matrices for the Linear SVM models.
def _plot_cm_svm(y_true, y_pred, title):
    """Plot a labelled confusion matrix for one Linear SVM variant."""
    cm = confusion_matrix(y_true, y_pred)
    # Rows/columns follow sorted label order [0, 1]; label 0 = fake, 1 = real
    # (see the prediction loop at the end of the notebook). The original
    # display_labels=[True, False] mapped 0 -> True and 1 -> False, i.e. backwards.
    cm_display = ConfusionMatrixDisplay(confusion_matrix=cm,
                                        display_labels=['Fake (0)', 'Real (1)'])
    fig, ax = plt.subplots(figsize=(8, 8))
    cm_display.plot(ax=ax)
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')
    ax.set_title(title)
    plt.show()

_plot_cm_svm(y_test, y_pred_svm, 'Confusion Matrix - Linear SVM(no preprocessing)')
_plot_cm_svm(y1_test, y_pred_svm_v1, 'Confusion Matrix - Linear SVM(version 1)')
_plot_cm_svm(y2_test, y_pred_svm_v2, 'Confusion Matrix - Linear SVM(version 2)')
_plot_cm_svm(y3_test, y_pred_svm_v3, 'Confusion Matrix - Linear SVM(version 3)')
#no pre-processing version
# classification_report expects (y_true, y_pred); original order was swapped,
# which transposes precision/recall and reports predicted-class supports.
print(classification_report(y_test, y_pred_svm))
precision recall f1-score support
0 0.96 0.98 0.97 6884
1 0.98 0.96 0.97 7424
accuracy 0.97 14308
macro avg 0.97 0.97 0.97 14308
weighted avg 0.97 0.97 0.97 14308
#version 1
# classification_report expects (y_true, y_pred); original order was swapped.
print(classification_report(y1_test, y_pred_svm_v1))
precision recall f1-score support
0 0.96 0.97 0.97 6878
1 0.98 0.96 0.97 7430
accuracy 0.97 14308
macro avg 0.97 0.97 0.97 14308
weighted avg 0.97 0.97 0.97 14308
#version 2
# classification_report expects (y_true, y_pred); original order was swapped.
print(classification_report(y2_test, y_pred_svm_v2))
precision recall f1-score support
0 0.96 0.97 0.97 6904
1 0.97 0.96 0.97 7404
accuracy 0.97 14308
macro avg 0.97 0.97 0.97 14308
weighted avg 0.97 0.97 0.97 14308
#version 3
# classification_report expects (y_true, y_pred); original order was swapped.
print(classification_report(y3_test, y_pred_svm_v3))
precision recall f1-score support
0 0.96 0.97 0.97 6900
1 0.97 0.96 0.97 7408
accuracy 0.97 14308
macro avg 0.97 0.97 0.97 14308
weighted avg 0.97 0.97 0.97 14308
#no pre-processing version
# Calculate the AUC-ROC score
# NOTE(review): computed from hard 0/1 predictions; use decision_function
# scores for a true ROC AUC.
auc_roc = roc_auc_score(y_test, y_pred_svm)
print('AUC-ROC score:', auc_roc)
# Plot the ROC curve
roc_display = RocCurveDisplay.from_predictions(y_test, y_pred_svm)
plt.title('ROC Curve (no preprocessing)')  # fixed typo: "prerprocessing"
plt.show()
AUC-ROC score: 0.9699721478179067
#version 1
# AUC-ROC for the version 1 Linear SVM (from hard label predictions)
auc_roc = roc_auc_score(y1_test, y_pred_svm_v1)
print(f'AUC-ROC score: {auc_roc}')
# Draw the ROC curve for the same predictions
roc_display = RocCurveDisplay.from_predictions(y1_test, y_pred_svm_v1)
plt.title('ROC Curve (version 1)')
plt.show()
AUC-ROC score: 0.9680043393171469
#version 2
# AUC-ROC for the version 2 Linear SVM (from hard label predictions)
auc_roc = roc_auc_score(y2_test, y_pred_svm_v2)
print(f'AUC-ROC score: {auc_roc}')
# Draw the ROC curve for the same predictions
roc_display = RocCurveDisplay.from_predictions(y2_test, y_pred_svm_v2)
plt.title('ROC Curve (version 2)')
plt.show()
AUC-ROC score: 0.9673474908335141
#version 3
# AUC-ROC for the version 3 Linear SVM (from hard label predictions)
auc_roc = roc_auc_score(y3_test, y_pred_svm_v3)
print(f'AUC-ROC score: {auc_roc}')
# Draw the ROC curve for the same predictions
roc_display = RocCurveDisplay.from_predictions(y3_test, y_pred_svm_v3)
plt.title('ROC Curve (version 3)')
plt.show()
AUC-ROC score: 0.967340961252469
#XGBoost
# Shared hyper-parameters for every preprocessing variant
xgb_params = dict(n_estimators=200, max_depth=6, learning_rate=0.1)
#no pre-processing
xgb_model = XGBClassifier(**xgb_params)
xgb_model.fit(x_train_vect, y_train)
#version 1
xgb_model_v1 = XGBClassifier(**xgb_params)
xgb_model_v1.fit(x1_train_vect, y1_train)
#version 2
xgb_model_v2 = XGBClassifier(**xgb_params)
xgb_model_v2.fit(x2_train_vect, y2_train)
#version 3
xgb_model_v3 = XGBClassifier(**xgb_params)
xgb_model_v3.fit(x3_train_vect, y3_train)
XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=0.1, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=6, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
n_estimators=200, n_jobs=None, num_parallel_tree=None,
predictor=None, random_state=None, ...)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=0.1, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=6, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
n_estimators=200, n_jobs=None, num_parallel_tree=None,
predictor=None, random_state=None, ...)#Predict XGBoost model with no pre-processing version
y_pred_xgb = xgb_model.predict(x_test_vect)
acc = accuracy_score(y_pred_xgb, y_test) * 100
print('Accuracy of XGBoost with no pre-processing version by using TF-IDF: {:.2f}%'.format(acc))
#Predict XGBoost model with version 1
y_pred_xgb_v1 = xgb_model_v1.predict(x1_test_vect)
acc = accuracy_score(y_pred_xgb_v1, y1_test) * 100
print('Accuracy of XGBoost with version 1 by using TF-IDF: {:.2f}%'.format(acc))
#Predict XGBoost model with version 2
y_pred_xgb_v2 = xgb_model_v2.predict(x2_test_vect)
acc = accuracy_score(y_pred_xgb_v2, y2_test) * 100
print('Accuracy of XGBoost with version 2 by using TF-IDF: {:.2f}%'.format(acc))
#Predict XGBoost model with version 3
y_pred_xgb_v3 = xgb_model_v3.predict(x3_test_vect)
acc = accuracy_score(y_pred_xgb_v3, y3_test) * 100
print('Accuracy of XGBoost with version 3 by using TF-IDF: {:.2f}%'.format(acc))
Accuracy of XGBoost with no pre-processing version by using TF-IDF: 96.46% Accuracy of XGBoost with version 1 by using TF-IDF: 96.43% Accuracy of XGBoost with version 2 by using TF-IDF: 96.48% Accuracy of XGBoost with version 3 by using TF-IDF: 96.31%
#Confusion matrices for the XGBoost models.
def _plot_cm_xgb(y_true, y_pred, title):
    """Plot a labelled confusion matrix for one XGBoost variant."""
    cm = confusion_matrix(y_true, y_pred)
    # Rows/columns follow sorted label order [0, 1]; label 0 = fake, 1 = real
    # (see the prediction loop at the end of the notebook). The original
    # display_labels=[True, False] mapped 0 -> True and 1 -> False, i.e. backwards.
    cm_display = ConfusionMatrixDisplay(confusion_matrix=cm,
                                        display_labels=['Fake (0)', 'Real (1)'])
    fig, ax = plt.subplots(figsize=(8, 8))
    cm_display.plot(ax=ax)
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')
    ax.set_title(title)
    plt.show()

_plot_cm_xgb(y_test, y_pred_xgb, 'Confusion Matrix - XGBoost(no preprocessing)')
_plot_cm_xgb(y1_test, y_pred_xgb_v1, 'Confusion Matrix - XGBoost(version 1)')
_plot_cm_xgb(y2_test, y_pred_xgb_v2, 'Confusion Matrix - XGBoost(version 2)')
_plot_cm_xgb(y3_test, y_pred_xgb_v3, 'Confusion Matrix - XGBoost(version 3)')
#no pre-processing version
# classification_report expects (y_true, y_pred); original order was swapped,
# which transposes precision/recall and reports predicted-class supports.
print(classification_report(y_test, y_pred_xgb))
precision recall f1-score support
0 0.95 0.98 0.96 6778
1 0.98 0.95 0.97 7530
accuracy 0.96 14308
macro avg 0.96 0.97 0.96 14308
weighted avg 0.97 0.96 0.96 14308
#version 1
# classification_report expects (y_true, y_pred); original order was swapped.
print(classification_report(y1_test, y_pred_xgb_v1))
precision recall f1-score support
0 0.95 0.98 0.96 6780
1 0.98 0.95 0.97 7528
accuracy 0.96 14308
macro avg 0.96 0.96 0.96 14308
weighted avg 0.96 0.96 0.96 14308
#version 2
# classification_report expects (y_true, y_pred); original order was swapped.
print(classification_report(y2_test, y_pred_xgb_v2))
precision recall f1-score support
0 0.95 0.98 0.96 6802
1 0.98 0.95 0.97 7506
accuracy 0.96 14308
macro avg 0.96 0.97 0.96 14308
weighted avg 0.97 0.96 0.96 14308
#version 3
# classification_report expects (y_true, y_pred); original order was swapped.
print(classification_report(y3_test, y_pred_xgb_v3))
precision recall f1-score support
0 0.95 0.97 0.96 6819
1 0.98 0.95 0.96 7489
accuracy 0.96 14308
macro avg 0.96 0.96 0.96 14308
weighted avg 0.96 0.96 0.96 14308
#no pre-processing version
# Calculate the AUC-ROC score
# NOTE(review): computed from hard 0/1 predictions; use predict_proba
# scores for a true ROC AUC.
auc_roc = roc_auc_score(y_test, y_pred_xgb)
print('AUC-ROC score:', auc_roc)
# Plot the ROC curve
roc_display = RocCurveDisplay.from_predictions(y_test, y_pred_xgb)
plt.title('ROC Curve (no preprocessing)')  # fixed typo: "prerprocessing"
plt.show()
AUC-ROC score: 0.9642047878368059
#version 1
# AUC-ROC for the version 1 XGBoost model (from hard label predictions)
auc_roc = roc_auc_score(y1_test, y_pred_xgb_v1)
print(f'AUC-ROC score: {auc_roc}')
# Draw the ROC curve for the same predictions
roc_display = RocCurveDisplay.from_predictions(y1_test, y_pred_xgb_v1)
plt.title('ROC Curve (version 1)')
plt.show()
AUC-ROC score: 0.9639283363231581
#version 2
# AUC-ROC for the version 2 XGBoost model (from hard label predictions)
auc_roc = roc_auc_score(y2_test, y_pred_xgb_v2)
print(f'AUC-ROC score: {auc_roc}')
# Draw the ROC curve for the same predictions
roc_display = RocCurveDisplay.from_predictions(y2_test, y_pred_xgb_v2)
plt.title('ROC Curve (version 2)')
plt.show()
AUC-ROC score: 0.9645236816272467
#version 3
# AUC-ROC for the version 3 XGBoost model (from hard label predictions)
auc_roc = roc_auc_score(y3_test, y_pred_xgb_v3)
print(f'AUC-ROC score: {auc_roc}')
# Draw the ROC curve for the same predictions
roc_display = RocCurveDisplay.from_predictions(y3_test, y_pred_xgb_v3)
plt.title('ROC Curve (version 3)')
plt.show()
AUC-ROC score: 0.9628032054456237
def predict(model, x_pred):
    """
    Return the predicted label of a single news article.

    Parameters
    ----------
    model : fitted classifier exposing a ``predict`` method
    x_pred : vectorized input for one article (e.g. one TF-IDF row)

    Returns
    -------
    The first (and only) prediction produced by ``model.predict``.
    """
    return model.predict(x_pred)[0]
# Create a list of all the models
models = [lr_model, rf_model, mnb_model, pac_model, dt_model, svm_model, xgb_model]
# Choose a test article to predict
x_pred_news = x_test_vect[6]
# Report every model's verdict on the same article
for model in models:
    prediction = predict(model, x_pred_news)
    model_name = type(model).__name__
    if prediction == 1:
        print('Label of the news article is [1] real news')
    else:
        print('Label of the news article is [0] fake news')
    print(f'Prediction from {model_name} is: {prediction}')
    print('--------------------------------------------------')
Label of the news article is [1] real news Prediction from LogisticRegression is: 1 -------------------------------------------------- Label of the news article is [1] real news Prediction from RandomForestClassifier is: 1 -------------------------------------------------- Label of the news article is [1] real news Prediction from MultinomialNB is: 1 -------------------------------------------------- Label of the news article is [1] real news Prediction from PassiveAggressiveClassifier is: 1 -------------------------------------------------- Label of the news article is [1] real news Prediction from DecisionTreeClassifier is: 1 -------------------------------------------------- Label of the news article is [1] real news Prediction from LinearSVC is: 1 -------------------------------------------------- Label of the news article is [1] real news Prediction from XGBClassifier is: 1 --------------------------------------------------